We used R version 4.4.2 [@base] and the following R packages: AnnotationDbi v. 1.66.0 [@AnnotationDbi], BiocManager v. 1.30.23 [@BiocManager], DESeq2 v. 1.44.0 [@DESeq2], GGally v. 2.2.1 [@GGally], ggcorrplot v. 0.1.4.1 [@ggcorrplot], ggpubr v. 0.6.0 [@ggpubr], ggtext v. 0.1.2 [@ggtext], here v. 1.0.1 [@here], hexbin v. 1.28.3 [@hexbin], knitr v. 1.48 [@knitr2014; @knitr2015; @knitr2024], pheatmap v. 1.0.12 [@pheatmap], renv v. 1.0.7 [@renv], rmarkdown v. 2.27 [@rmarkdown2018; @rmarkdown2020; @rmarkdown2024], scales v. 1.3.0 [@scales], svglite v. 2.1.3 [@svglite], tidyverse v. 2.0.0 [@tidyverse], txdbmaker v. 1.0.1 [@txdbmaker], tximport v. 1.32.0 [@tximport], UpSetR v. 1.4.0 [@UpSetR], VennDiagram v. 1.7.3 [@VennDiagram], viridis v. 0.6.5 [@viridis], vsn v. 3.72.0 [@vsn].
The RDS objects required for this report can be generated using the data import script.
If the RDS objects have already been created in a previous run, the following cell can be uncommented and run, and the remaining cells in the data import section can be skipped. See RDS object section).
dir.create(here("results/figures/"), showWarnings = FALSE)
Number of genes per species:
gene_counts_txi %>%
group_by(species) %>%
summarise(n = n()) %>%
ungroup() %>%
mutate(total = sum(n), rel.freq = n / total)
Number of genes per species after filtering low counts (n < 10 for all samples):
gene_counts_txi_filtered %>%
group_by(species) %>%
summarise(n = n()) %>%
ungroup() %>%
mutate(total = sum(n), rel.freq = n / total)
Number of gene types per species:
gene_counts_txi %>%
group_by(species, gene_type) %>%
summarise(n = n()) %>%
ungroup() %>%
mutate(total = sum(n), rel.freq = round(n / total, 5))
## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.
Number of gene types per species after filtering out low counts (n < 10 for all samples):
gene_counts_txi_filtered %>%
group_by(species, gene_type) %>%
summarise(n = n()) %>%
ungroup() %>%
mutate(total = sum(n), rel.freq = round(n / total, 5))
## `summarise()` has grouped output by 'species'. You can override using the
## `.groups` argument.
n_genes <- dim(gene_counts_txi)[1]
# stopifnot(dim(gene_counts_txi)[1] == (gene_counts_salmon %>% group_by(sample) %>% summarise(n = n()) %>% select(n) %>% unique()))
n_genes_filtered <- dim(gse_filtered)[1]
n_human_genes <- gene_counts_txi %>%
filter(species == "human") %>%
count() %>%
pull()
n_human_genes_filtered <- gene_counts_txi_filtered %>%
filter(species == "human") %>%
count() %>%
pull()
n_pk_genes <- dim(gse_pk)[1]
stopifnot(dim(gse_pk)[1] == (gene_counts_txi %>% filter(species == "pk") %>% count()))
n_pk_genes_filtered <- dim(gse_pk_filtered)[1]
stopifnot(dim(gse_pk_filtered)[1] == (gene_counts_txi_filtered %>% filter(species == "pk") %>% count()))
Number of unique genes per sample:
gene_counts_txi_filtered %>%
summarise(across(starts_with("104974"), ~sum(!is.na(.x))))
gene_counts_txi_filtered %>%
to_long() %>%
group_by(sample) %>%
summarize(n = n(),
n_10 = sum(counts >= 10)
)
Number of unique Pk genes per sample (after filtering):
gene_counts_txi_filtered %>%
filter(species=="pk") %>%
summarise(across(starts_with("104974"), ~sum(!is.na(.x))))
gene_counts_txi_filtered %>%
filter(species=="pk") %>%
to_long() %>%
group_by(sample) %>%
summarize(n = n(),
n_10 = sum(counts >= 10)
)
Number of gene counts per sample:
colSums(gene_counts_txi %>% select(starts_with("104974")))
## 104974-001-001_S1_L001 104974-001-002_S1_L001 104974-001-003_S1_L001
## 22866933 33913208 12906988
## 104974-001-004_S1_L001 104974-001-005_S1_L001 104974-001-006_S1_L001
## 7437720 11446209 8903155
## 104974-001-007_S1_L001 104974-001-008_S1_L001 104974-001-009_S1_L001
## 34102029 26530404 43610258
## 104974-001-010_S1_L001 104974-001-011_S1_L001 104974-001-012_S1_L001
## 45326671 39091317 39997344
## 104974-001-013_S1_L001 104974-001-014_S1_L001 104974-001-015_S1_L001
## 61837211 47858788 74893472
## 104974-001-016_S1_L001
## 94099625
gene_counts_txi %>%
summarise(across(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`, sum)) %>%
to_long() %>%
left_join(samples, by = "sample") %>%
ggplot(aes(x = sample_name, y = counts)) +
geom_bar(stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Total gene counts") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
Number of Pk gene counts per sample:
colSums(gene_counts_txi %>% filter(species == "pk") %>% select(starts_with("104974")))
## 104974-001-001_S1_L001 104974-001-002_S1_L001 104974-001-003_S1_L001
## 516911 1222797 2797701
## 104974-001-004_S1_L001 104974-001-005_S1_L001 104974-001-006_S1_L001
## 2574296 4051264 232640
## 104974-001-007_S1_L001 104974-001-008_S1_L001 104974-001-009_S1_L001
## 14320126 13637513 34289699
## 104974-001-010_S1_L001 104974-001-011_S1_L001 104974-001-012_S1_L001
## 37905536 31095828 340147
## 104974-001-013_S1_L001 104974-001-014_S1_L001 104974-001-015_S1_L001
## 637366 724010 3099047
## 104974-001-016_S1_L001
## 1500227
gene_counts_txi %>% filter(species == "pk") %>%
summarise(across(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`, sum)) %>%
to_long() %>%
left_join(samples, by = "sample") %>%
ggplot(aes(x = sample_name, y = counts)) +
geom_bar(stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Pk gene counts") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
The direct comparison of RPKM and TPM across samples is meaningful only when there are equal total RNAs between compared samples and the distribution of RNA populations are close to each other.
Make sure both samples use the same RNA isolation approach [poly(A)+ selection versus ribosomal RNA depletion]. If not, they should not be compared.
Check the fraction of the ribosomal, mitochondrial and globin RNAs, and the top highly expressed transcripts and see whether such RNAs constitute a very large part of the sequenced reads in a sample, and thus decrease the sequencing “real estate” available for the remaining genes in that sample. If the calculated fractions in two samples differ significantly, do not compare RPKM or TPM values directly.
TPM should never be used for quantitative comparisons across samples when the total RNA contents and its distributions are very different. However, under appropriate circumstances, TPM can still be useful for qualitative comparison such as PCA and clustering analysis.
(Zhao et al., 2020 - http://dx.doi.org/10.1261/rna.074922.120)
Looking at our samples, especially the ones prepared using the Illumina kit exhibit much higher total RNA content. Based on the output of Picard MarkDuplicates, the differences can at least partially be attributed to the number of PCR duplicates. The proportion of duplicates is comparable across kits, but they do appear to be consistently higher for some depletion methods (no filter > centpip > plasmo > pmacs > cellulose).
gene_counts_txi_filtered %>%
summarise(across(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`, sum)) %>%
to_long() %>%
left_join(samples, by = "sample") %>%
ggplot(aes(x = sample_short, y = counts, fill = kit_name)) +
geom_bar(stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
# labels = c("mRNA Illumina", "mRNA Qiagen globin depletion", "mRNA Qiagen globin/rRNA depletion", "tRNA Qiagen globin/rRNA depletion")) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Number of gene-level counts", fill = "Kit")
pcr_dups %>%
left_join(samples, by = join_by(Sample == sample)) %>%
mutate(READS_IN_DUPLICATE_PAIRS = 2 * READ_PAIR_DUPLICATES) %>%
mutate(READS_IN_UNIQUE_PAIRS = 2 * (READ_PAIRS_EXAMINED - READ_PAIR_DUPLICATES)) %>%
mutate(READS_IN_UNIQUE_UNPAIRED = UNPAIRED_READS_EXAMINED - UNPAIRED_READ_DUPLICATES) %>%
mutate(READS_IN_DUPLICATE_PAIRS_OPTICAL = 2 * READ_PAIR_OPTICAL_DUPLICATES) %>%
mutate(READS_IN_DUPLICATE_PAIRS_NONOPTICAL = READS_IN_DUPLICATE_PAIRS - READS_IN_DUPLICATE_PAIRS_OPTICAL) %>%
mutate(READS_IN_DUPLICATE_UNPAIRED = UNPAIRED_READ_DUPLICATES) %>%
mutate(READS_UNMAPPED = UNMAPPED_READS) %>%
select(c(sample_name, condition, kit, READS_IN_UNIQUE_PAIRS, READS_IN_UNIQUE_UNPAIRED, READS_IN_DUPLICATE_PAIRS_OPTICAL, READS_IN_DUPLICATE_PAIRS_NONOPTICAL, READS_IN_DUPLICATE_UNPAIRED)) %>%
pivot_longer(cols = 4:8, names_to = "type", values_to = "counts") %>%
ggplot(aes(x = sample_name, y = counts, fill = type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Number of reads", fill = "Picard deduplication stats")
The output reported by samtool flagstats and STAR’s
Log.final.out files cannot be compared directly: the latter
reports counts of read-pairs as opposed to individual reads, and the
former does not contain unmapped reads (these are not added to the
.bam outputs by default) while also counting multi-mapped
reads. We opted to report the STAR statistics, but the flagstat output
is included for completion’s sake.
star_stats
star_stats %>%
pivot_longer(cols = 2:6, names_to = "type", values_to = "counts") %>%
left_join(samples, by = join_by(Sample == sample)) %>%
ggplot(aes(x = sample_name, y = counts, fill = type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Number of read pairs", fill = "Mapping stats")
mapping_stats_df
mapping_stats_df %>%
left_join(samples, by = "sample") %>%
pivot_longer(cols = flagstat_primary_pk:flagstat_primary_human, names_to = "type", values_to = "counts") %>%
ggplot(aes(x = sample_name, y = counts, fill = type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Number of primary reads", fill = "Samtools flagstat")
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-B2M | B2M | NA | human | human | 104974-001-001_S1_L001 | 27679.223 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-001_S1_L001 | 27104.637 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-001_S1_L001 | 22578.637 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-001_S1_L001 | 20902.902 |
| gene-ND3 | ND3 | NA | human | human | 104974-001-001_S1_L001 | 19021.278 |
| gene-RPS27 | RPS27 | NA | human | human | 104974-001-001_S1_L001 | 17762.379 |
| gene-RNR1 | RNR1 | NA | human | human_rRNA | 104974-001-001_S1_L001 | 14834.316 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-001_S1_L001 | 14662.673 |
| gene-ATP6 | ATP6 | NA | human | human | 104974-001-001_S1_L001 | 11625.447 |
| gene-TMSB4X | TMSB4X | NA | human | human | 104974-001-001_S1_L001 | 11335.273 |
| gene-FTL | FTL | NA | human | human | 104974-001-001_S1_L001 | 10562.604 |
| gene-COX2 | COX2 | NA | human | human | 104974-001-001_S1_L001 | 10466.481 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-001_S1_L001 | 9533.806 |
| gene-ND2 | ND2 | NA | human | human | 104974-001-001_S1_L001 | 9363.211 |
| gene-RPLP2 | RPLP2 | NA | human | human | 104974-001-001_S1_L001 | 9110.287 |
| gene-RPS20 | RPS20 | NA | human | human | 104974-001-001_S1_L001 | 8465.206 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-001_S1_L001 | 8463.110 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-001_S1_L001 | 7906.045 |
| gene-COX1 | COX1 | NA | human | human | 104974-001-001_S1_L001 | 7773.208 |
| gene-ND1 | ND1 | NA | human | human | 104974-001-001_S1_L001 | 7213.298 |
| gene-RPS21 | RPS21 | NA | human | human | 104974-001-001_S1_L001 | 7198.383 |
| gene-EEF1A1 | EEF1A1 | NA | human | human | 104974-001-001_S1_L001 | 6880.381 |
| gene-RPS24 | RPS24 | NA | human | human | 104974-001-001_S1_L001 | 6067.828 |
| gene-RPL30 | RPL30 | NA | human | human | 104974-001-001_S1_L001 | 5941.840 |
| gene-B2M-2 | B2M | NA | human | human | 104974-001-001_S1_L001 | 5852.971 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-B2M | B2M | NA | human | human | 104974-001-002_S1_L001 | 34855.539 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-002_S1_L001 | 29430.692 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-002_S1_L001 | 24583.661 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-002_S1_L001 | 24327.359 |
| gene-ND3 | ND3 | NA | human | human | 104974-001-002_S1_L001 | 16508.778 |
| gene-RNR1 | RNR1 | NA | human | human_rRNA | 104974-001-002_S1_L001 | 16214.239 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-002_S1_L001 | 15349.098 |
| gene-RPS27 | RPS27 | NA | human | human | 104974-001-002_S1_L001 | 13599.609 |
| gene-FTL | FTL | NA | human | human | 104974-001-002_S1_L001 | 13013.354 |
| gene-TMSB4X | TMSB4X | NA | human | human | 104974-001-002_S1_L001 | 11699.378 |
| gene-ATP6 | ATP6 | NA | human | human | 104974-001-002_S1_L001 | 11343.645 |
| gene-COX2 | COX2 | NA | human | human | 104974-001-002_S1_L001 | 10378.688 |
| gene-ND2 | ND2 | NA | human | human | 104974-001-002_S1_L001 | 9614.317 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-002_S1_L001 | 8803.669 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-002_S1_L001 | 8432.933 |
| gene-COX1 | COX1 | NA | human | human | 104974-001-002_S1_L001 | 8236.118 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-002_S1_L001 | 8169.655 |
| gene-RPLP2 | RPLP2 | NA | human | human | 104974-001-002_S1_L001 | 7483.747 |
| gene-ND1 | ND1 | NA | human | human | 104974-001-002_S1_L001 | 7330.897 |
| gene-EEF1A1 | EEF1A1 | NA | human | human | 104974-001-002_S1_L001 | 7075.555 |
| gene-B2M-2 | B2M | NA | human | human | 104974-001-002_S1_L001 | 7055.202 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-002_S1_L001 | 6973.821 |
| gene-RPS20 | RPS20 | NA | human | human | 104974-001-002_S1_L001 | 6841.469 |
| gene-SRGN | SRGN | NA | human | human | 104974-001-002_S1_L001 | 6536.149 |
| gene-ND4 | ND4 | NA | human | human | 104974-001-002_S1_L001 | 5657.471 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-003_S1_L001 | 46533.376 |
| gene-FTL | FTL | NA | human | human | 104974-001-003_S1_L001 | 29467.269 |
| gene-HBB | HBB | NA | human | human_globin | 104974-001-003_S1_L001 | 27466.991 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-003_S1_L001 | 23643.612 |
| gene-UBA52 | UBA52 | NA | human | human | 104974-001-003_S1_L001 | 21219.997 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-003_S1_L001 | 20864.756 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-003_S1_L001 | 19293.910 |
| gene-UBB | UBB | NA | human | human | 104974-001-003_S1_L001 | 17736.970 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-003_S1_L001 | 16262.189 |
| gene-B2M | B2M | NA | human | human | 104974-001-003_S1_L001 | 14549.406 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-003_S1_L001 | 12990.299 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-003_S1_L001 | 11766.569 |
| gene-OAZ1 | OAZ1 | NA | human | human | 104974-001-003_S1_L001 | 9783.310 |
| gene-FKBP8 | FKBP8 | NA | human | human | 104974-001-003_S1_L001 | 8678.414 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-003_S1_L001 | 7978.580 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-003_S1_L001 | 7889.953 |
| gene-SLC25A39 | SLC25A39 | NA | human | human | 104974-001-003_S1_L001 | 7762.571 |
| gene-RPS27 | RPS27 | NA | human | human | 104974-001-003_S1_L001 | 7712.087 |
| gene-RPLP2 | RPLP2 | NA | human | human | 104974-001-003_S1_L001 | 7343.200 |
| gene-RNR1 | RNR1 | NA | human | human_rRNA | 104974-001-003_S1_L001 | 6908.576 |
| gene-RPL30 | RPL30 | NA | human | human | 104974-001-003_S1_L001 | 6190.096 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-003_S1_L001 | 6067.999 |
| gene-RPS11 | RPS11 | NA | human | human | 104974-001-003_S1_L001 | 5505.873 |
| gene-ND3 | ND3 | NA | human | human | 104974-001-003_S1_L001 | 5258.114 |
| gene-PKNH_0418600 | gene-PKNH_0418600 | protein_coding | pk | pk | 104974-001-003_S1_L001 | 5022.868 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-004_S1_L001 | 52755.665 |
| gene-HBB | HBB | NA | human | human_globin | 104974-001-004_S1_L001 | 22097.339 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-004_S1_L001 | 21210.681 |
| gene-FTL | FTL | NA | human | human | 104974-001-004_S1_L001 | 19832.883 |
| gene-UBA52 | UBA52 | NA | human | human | 104974-001-004_S1_L001 | 13148.031 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-004_S1_L001 | 12112.246 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-004_S1_L001 | 11859.816 |
| gene-B2M | B2M | NA | human | human | 104974-001-004_S1_L001 | 10750.176 |
| gene-UBB | UBB | NA | human | human | 104974-001-004_S1_L001 | 9926.561 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-004_S1_L001 | 9832.230 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-004_S1_L001 | 8986.872 |
| gene-PKNH_0418600 | gene-PKNH_0418600 | protein_coding | pk | pk | 104974-001-004_S1_L001 | 8314.573 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-004_S1_L001 | 7066.256 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-004_S1_L001 | 7006.419 |
| gene-FKBP8 | FKBP8 | NA | human | human | 104974-001-004_S1_L001 | 6986.322 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-004_S1_L001 | 6966.357 |
| gene-OAZ1 | OAZ1 | NA | human | human | 104974-001-004_S1_L001 | 6785.591 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-004_S1_L001 | 6779.247 |
| gene-COX1 | COX1 | NA | human | human | 104974-001-004_S1_L001 | 6315.816 |
| gene-RPS27 | RPS27 | NA | human | human | 104974-001-004_S1_L001 | 6261.843 |
| gene-SLC25A39 | SLC25A39 | NA | human | human | 104974-001-004_S1_L001 | 6113.571 |
| gene-ATP6 | ATP6 | NA | human | human | 104974-001-004_S1_L001 | 6088.562 |
| gene-ND3 | ND3 | NA | human | human | 104974-001-004_S1_L001 | 5573.493 |
| gene-COX2 | COX2 | NA | human | human | 104974-001-004_S1_L001 | 5157.936 |
| gene-RPLP2 | RPLP2 | NA | human | human | 104974-001-004_S1_L001 | 5087.297 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-005_S1_L001 | 49591.793 |
| gene-FTL | FTL | NA | human | human | 104974-001-005_S1_L001 | 44351.515 |
| gene-UBA52 | UBA52 | NA | human | human | 104974-001-005_S1_L001 | 36767.580 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-005_S1_L001 | 35030.120 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-005_S1_L001 | 30090.973 |
| gene-HBB | HBB | NA | human | human_globin | 104974-001-005_S1_L001 | 29687.257 |
| gene-UBB | UBB | NA | human | human | 104974-001-005_S1_L001 | 26975.937 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-005_S1_L001 | 26033.024 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-005_S1_L001 | 20473.960 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-005_S1_L001 | 17038.159 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-005_S1_L001 | 16668.188 |
| gene-OAZ1 | OAZ1 | NA | human | human | 104974-001-005_S1_L001 | 14476.128 |
| gene-FKBP8 | FKBP8 | NA | human | human | 104974-001-005_S1_L001 | 13027.216 |
| gene-SLC25A39 | SLC25A39 | NA | human | human | 104974-001-005_S1_L001 | 12949.204 |
| gene-RNR1 | RNR1 | NA | human | human_rRNA | 104974-001-005_S1_L001 | 8443.443 |
| gene-RPLP2 | RPLP2 | NA | human | human | 104974-001-005_S1_L001 | 7333.275 |
| gene-RPS11 | RPS11 | NA | human | human | 104974-001-005_S1_L001 | 7058.348 |
| gene-PKNH_0418600 | gene-PKNH_0418600 | protein_coding | pk | pk | 104974-001-005_S1_L001 | 6826.241 |
| gene-RPL30 | RPL30 | NA | human | human | 104974-001-005_S1_L001 | 6393.033 |
| gene-GUK1 | GUK1 | NA | human | human | 104974-001-005_S1_L001 | 6026.648 |
| gene-RPL12 | RPL12 | NA | human | human | 104974-001-005_S1_L001 | 5719.974 |
| gene-ADIPOR1 | ADIPOR1 | NA | human | human | 104974-001-005_S1_L001 | 5600.955 |
| gene-PKNH_1355500 | gene-PKNH_1355500 | protein_coding | pk | pk | 104974-001-005_S1_L001 | 5244.759 |
| gene-RPS15 | RPS15 | NA | human | human | 104974-001-005_S1_L001 | 4708.231 |
| gene-RPS24 | RPS24 | NA | human | human | 104974-001-005_S1_L001 | 4399.440 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-B2M | B2M | NA | human | human | 104974-001-006_S1_L001 | 28553.397 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-006_S1_L001 | 19236.293 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-006_S1_L001 | 18303.677 |
| gene-TMSB4X | TMSB4X | NA | human | human | 104974-001-006_S1_L001 | 18124.259 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-006_S1_L001 | 16150.164 |
| gene-B2M-2 | B2M | NA | human | human | 104974-001-006_S1_L001 | 15785.354 |
| gene-ND3 | ND3 | NA | human | human | 104974-001-006_S1_L001 | 15074.849 |
| gene-COX1 | COX1 | NA | human | human | 104974-001-006_S1_L001 | 14709.890 |
| gene-COX2 | COX2 | NA | human | human | 104974-001-006_S1_L001 | 14454.597 |
| gene-ND2 | ND2 | NA | human | human | 104974-001-006_S1_L001 | 14204.682 |
| gene-FTL | FTL | NA | human | human | 104974-001-006_S1_L001 | 11557.118 |
| gene-ND1 | ND1 | NA | human | human | 104974-001-006_S1_L001 | 10850.009 |
| gene-ATP6 | ATP6 | NA | human | human | 104974-001-006_S1_L001 | 10710.461 |
| gene-RPS27 | RPS27 | NA | human | human | 104974-001-006_S1_L001 | 9942.583 |
| gene-ND4 | ND4 | NA | human | human | 104974-001-006_S1_L001 | 9064.038 |
| gene-ATP8 | ATP8 | NA | human | human | 104974-001-006_S1_L001 | 8615.542 |
| gene-EEF1A1 | EEF1A1 | NA | human | human | 104974-001-006_S1_L001 | 8459.874 |
| gene-RPS20 | RPS20 | NA | human | human | 104974-001-006_S1_L001 | 7580.700 |
| gene-CYTB | CYTB | NA | human | human | 104974-001-006_S1_L001 | 7165.345 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-006_S1_L001 | 7005.028 |
| gene-RPL19 | RPL19 | NA | human | human | 104974-001-006_S1_L001 | 6904.254 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-006_S1_L001 | 6793.811 |
| gene-RPL34 | RPL34 | NA | human | human | 104974-001-006_S1_L001 | 5924.000 |
| gene-RPL9 | RPL9 | NA | human | human | 104974-001-006_S1_L001 | 5814.001 |
| gene-FTH1 | FTH1 | NA | human | human | 104974-001-006_S1_L001 | 5571.215 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-RN7SL2 | RN7SL2 | NA | human | human | 104974-001-007_S1_L001 | 131068.139 |
| gene-RN7SL1 | RN7SL1 | NA | human | human | 104974-001-007_S1_L001 | 92664.227 |
| gene-PKNH_0321100 | gene-PKNH_0321100 | rRNA | pk | pk_rRNA | 104974-001-007_S1_L001 | 73099.379 |
| gene-RN7SK | RN7SK | NA | human | human | 104974-001-007_S1_L001 | 70738.871 |
| gene-PKNH_1001000 | gene-PKNH_1001000 | rRNA | pk | pk_rRNA | 104974-001-007_S1_L001 | 58787.433 |
| gene-PKNH_0320900 | gene-PKNH_0320900 | rRNA | pk | pk_rRNA | 104974-001-007_S1_L001 | 27365.645 |
| gene-PKNH_1001200 | gene-PKNH_1001200 | rRNA | pk | pk_rRNA | 104974-001-007_S1_L001 | 25716.594 |
| gene-PKNH_0321000 | gene-PKNH_0321000 | rRNA | pk | pk_rRNA | 104974-001-007_S1_L001 | 17270.634 |
| gene-RMRP | RMRP | NA | human | human | 104974-001-007_S1_L001 | 14366.099 |
| gene-B2M | B2M | NA | human | human | 104974-001-007_S1_L001 | 12165.333 |
| gene-PKNH_1339200 | gene-PKNH_1339200 | ncRNA | pk | pk | 104974-001-007_S1_L001 | 8712.438 |
| gene-TMSB4X | TMSB4X | NA | human | human | 104974-001-007_S1_L001 | 7823.012 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-007_S1_L001 | 7685.253 |
| gene-PKNH_1001100 | gene-PKNH_1001100 | rRNA | pk | pk_rRNA | 104974-001-007_S1_L001 | 7579.708 |
| gene-ND2 | ND2 | NA | human | human | 104974-001-007_S1_L001 | 6852.100 |
| gene-COX1 | COX1 | NA | human | human | 104974-001-007_S1_L001 | 6031.814 |
| gene-COX2 | COX2 | NA | human | human | 104974-001-007_S1_L001 | 5463.136 |
| gene-ATP6 | ATP6 | NA | human | human | 104974-001-007_S1_L001 | 5451.582 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-007_S1_L001 | 5309.715 |
| gene-ND1 | ND1 | NA | human | human | 104974-001-007_S1_L001 | 5106.975 |
| gene-ND4 | ND4 | NA | human | human | 104974-001-007_S1_L001 | 4979.826 |
| gene-PKNH_0935200 | gene-PKNH_0935200 | snRNA | pk | pk | 104974-001-007_S1_L001 | 4468.936 |
| gene-ND3 | ND3 | NA | human | human | 104974-001-007_S1_L001 | 4455.028 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-007_S1_L001 | 4377.385 |
| gene-RPPH1 | RPPH1 | NA | human | human | 104974-001-007_S1_L001 | 4178.458 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-RN7SL2 | RN7SL2 | NA | human | human | 104974-001-008_S1_L001 | 139279.456 |
| gene-RN7SL1 | RN7SL1 | NA | human | human | 104974-001-008_S1_L001 | 88237.728 |
| gene-PKNH_1001000 | gene-PKNH_1001000 | rRNA | pk | pk_rRNA | 104974-001-008_S1_L001 | 78313.591 |
| gene-PKNH_0321100 | gene-PKNH_0321100 | rRNA | pk | pk_rRNA | 104974-001-008_S1_L001 | 76693.896 |
| gene-RN7SK | RN7SK | NA | human | human | 104974-001-008_S1_L001 | 76096.199 |
| gene-PKNH_0320900 | gene-PKNH_0320900 | rRNA | pk | pk_rRNA | 104974-001-008_S1_L001 | 30296.682 |
| gene-PKNH_1001200 | gene-PKNH_1001200 | rRNA | pk | pk_rRNA | 104974-001-008_S1_L001 | 27906.598 |
| gene-PKNH_0321000 | gene-PKNH_0321000 | rRNA | pk | pk_rRNA | 104974-001-008_S1_L001 | 23171.875 |
| gene-RMRP | RMRP | NA | human | human | 104974-001-008_S1_L001 | 16032.224 |
| gene-PKNH_1001100 | gene-PKNH_1001100 | rRNA | pk | pk_rRNA | 104974-001-008_S1_L001 | 14289.531 |
| gene-B2M | B2M | NA | human | human | 104974-001-008_S1_L001 | 12222.046 |
| gene-PKNH_1339200 | gene-PKNH_1339200 | ncRNA | pk | pk | 104974-001-008_S1_L001 | 9740.299 |
| gene-PKNH_0935200 | gene-PKNH_0935200 | snRNA | pk | pk | 104974-001-008_S1_L001 | 7697.558 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-008_S1_L001 | 7411.967 |
| gene-TMSB4X | TMSB4X | NA | human | human | 104974-001-008_S1_L001 | 6867.503 |
| gene-ND2 | ND2 | NA | human | human | 104974-001-008_S1_L001 | 5897.639 |
| gene-COX1 | COX1 | NA | human | human | 104974-001-008_S1_L001 | 5487.208 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-008_S1_L001 | 5483.937 |
| gene-COX2 | COX2 | NA | human | human | 104974-001-008_S1_L001 | 5235.476 |
| gene-RPPH1 | RPPH1 | NA | human | human | 104974-001-008_S1_L001 | 5041.748 |
| gene-RPPH1-2 | RPPH1 | NA | human | human | 104974-001-008_S1_L001 | 4996.489 |
| gene-ATP6 | ATP6 | NA | human | human | 104974-001-008_S1_L001 | 4855.997 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-008_S1_L001 | 4804.481 |
| gene-ND1 | ND1 | NA | human | human | 104974-001-008_S1_L001 | 4769.538 |
| gene-ND4 | ND4 | NA | human | human | 104974-001-008_S1_L001 | 4491.039 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-RN7SL2 | RN7SL2 | NA | human | human | 104974-001-009_S1_L001 | 145681.116 |
| gene-PKNH_0321100 | gene-PKNH_0321100 | rRNA | pk | pk_rRNA | 104974-001-009_S1_L001 | 144323.085 |
| gene-PKNH_1001000 | gene-PKNH_1001000 | rRNA | pk | pk_rRNA | 104974-001-009_S1_L001 | 113693.416 |
| gene-RN7SL1 | RN7SL1 | NA | human | human | 104974-001-009_S1_L001 | 97956.989 |
| gene-PKNH_0320900 | gene-PKNH_0320900 | rRNA | pk | pk_rRNA | 104974-001-009_S1_L001 | 58482.134 |
| gene-PKNH_0321000 | gene-PKNH_0321000 | rRNA | pk | pk_rRNA | 104974-001-009_S1_L001 | 54071.123 |
| gene-PKNH_1001200 | gene-PKNH_1001200 | rRNA | pk | pk_rRNA | 104974-001-009_S1_L001 | 53881.108 |
| gene-PKNH_1001100 | gene-PKNH_1001100 | rRNA | pk | pk_rRNA | 104974-001-009_S1_L001 | 24293.746 |
| gene-RN7SK | RN7SK | NA | human | human | 104974-001-009_S1_L001 | 20907.443 |
| gene-PKNH_0935200 | gene-PKNH_0935200 | snRNA | pk | pk | 104974-001-009_S1_L001 | 18472.293 |
| gene-PKNH_1339200 | gene-PKNH_1339200 | ncRNA | pk | pk | 104974-001-009_S1_L001 | 12768.772 |
| gene-HBB | HBB | NA | human | human_globin | 104974-001-009_S1_L001 | 6401.856 |
| gene-FTL | FTL | NA | human | human | 104974-001-009_S1_L001 | 5156.650 |
| gene-UBB | UBB | NA | human | human | 104974-001-009_S1_L001 | 3804.390 |
| gene-ADIPOR1 | ADIPOR1 | NA | human | human | 104974-001-009_S1_L001 | 3777.510 |
| gene-RPPH1-2 | RPPH1 | NA | human | human | 104974-001-009_S1_L001 | 3718.740 |
| gene-RPPH1 | RPPH1 | NA | human | human | 104974-001-009_S1_L001 | 3692.364 |
| gene-RNY1 | RNY1 | NA | human | human | 104974-001-009_S1_L001 | 3687.490 |
| gene-RNA28SN4 | RNA28SN4 | NA | human | human_rRNA | 104974-001-009_S1_L001 | 3199.299 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-009_S1_L001 | 3180.375 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-009_S1_L001 | 2826.477 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-009_S1_L001 | 2678.665 |
| gene-RMRP | RMRP | NA | human | human | 104974-001-009_S1_L001 | 2494.538 |
| gene-UBA52 | UBA52 | NA | human | human | 104974-001-009_S1_L001 | 2312.951 |
| gene-PKNH_0418600 | gene-PKNH_0418600 | protein_coding | pk | pk | 104974-001-009_S1_L001 | 2094.309 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-PKNH_0321100 | gene-PKNH_0321100 | rRNA | pk | pk_rRNA | 104974-001-010_S1_L001 | 181501.107 |
| gene-PKNH_1001000 | gene-PKNH_1001000 | rRNA | pk | pk_rRNA | 104974-001-010_S1_L001 | 143352.902 |
| gene-RN7SL2 | RN7SL2 | NA | human | human | 104974-001-010_S1_L001 | 86488.492 |
| gene-PKNH_0320900 | gene-PKNH_0320900 | rRNA | pk | pk_rRNA | 104974-001-010_S1_L001 | 70146.903 |
| gene-PKNH_1001200 | gene-PKNH_1001200 | rRNA | pk | pk_rRNA | 104974-001-010_S1_L001 | 66180.869 |
| gene-RN7SL1 | RN7SL1 | NA | human | human | 104974-001-010_S1_L001 | 60353.922 |
| gene-PKNH_0321000 | gene-PKNH_0321000 | rRNA | pk | pk_rRNA | 104974-001-010_S1_L001 | 54735.717 |
| gene-PKNH_1001100 | gene-PKNH_1001100 | rRNA | pk | pk_rRNA | 104974-001-010_S1_L001 | 20313.046 |
| gene-PKNH_0935200 | gene-PKNH_0935200 | snRNA | pk | pk | 104974-001-010_S1_L001 | 16268.334 |
| gene-RN7SK | RN7SK | NA | human | human | 104974-001-010_S1_L001 | 13867.096 |
| gene-PKNH_1339200 | gene-PKNH_1339200 | ncRNA | pk | pk | 104974-001-010_S1_L001 | 9108.890 |
| gene-RNA28SN4 | RNA28SN4 | NA | human | human_rRNA | 104974-001-010_S1_L001 | 3993.278 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-010_S1_L001 | 3889.037 |
| gene-HBB | HBB | NA | human | human_globin | 104974-001-010_S1_L001 | 3806.526 |
| gene-ADIPOR1 | ADIPOR1 | NA | human | human | 104974-001-010_S1_L001 | 3607.743 |
| gene-FTL | FTL | NA | human | human | 104974-001-010_S1_L001 | 3459.588 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-010_S1_L001 | 3183.081 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-010_S1_L001 | 2799.467 |
| gene-RPPH1 | RPPH1 | NA | human | human | 104974-001-010_S1_L001 | 2659.526 |
| gene-PKNH_0418600 | gene-PKNH_0418600 | protein_coding | pk | pk | 104974-001-010_S1_L001 | 2645.063 |
| gene-RPPH1-2 | RPPH1 | NA | human | human | 104974-001-010_S1_L001 | 2627.532 |
| gene-PKNH_1132600 | gene-PKNH_1132600 | protein_coding | pk | pk | 104974-001-010_S1_L001 | 2592.234 |
| gene-COX1 | COX1 | NA | human | human | 104974-001-010_S1_L001 | 2414.909 |
| gene-UBB | UBB | NA | human | human | 104974-001-010_S1_L001 | 2265.004 |
| gene-RMRP | RMRP | NA | human | human | 104974-001-010_S1_L001 | 2211.677 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-RN7SL2 | RN7SL2 | NA | human | human | 104974-001-011_S1_L001 | 161042.174 |
| gene-PKNH_0321100 | gene-PKNH_0321100 | rRNA | pk | pk_rRNA | 104974-001-011_S1_L001 | 147935.000 |
| gene-PKNH_1001000 | gene-PKNH_1001000 | rRNA | pk | pk_rRNA | 104974-001-011_S1_L001 | 115582.956 |
| gene-RN7SL1 | RN7SL1 | NA | human | human | 104974-001-011_S1_L001 | 108220.532 |
| gene-PKNH_0320900 | gene-PKNH_0320900 | rRNA | pk | pk_rRNA | 104974-001-011_S1_L001 | 61256.956 |
| gene-PKNH_1001200 | gene-PKNH_1001200 | rRNA | pk | pk_rRNA | 104974-001-011_S1_L001 | 55243.977 |
| gene-PKNH_0321000 | gene-PKNH_0321000 | rRNA | pk | pk_rRNA | 104974-001-011_S1_L001 | 42015.846 |
| gene-PKNH_1001100 | gene-PKNH_1001100 | rRNA | pk | pk_rRNA | 104974-001-011_S1_L001 | 18712.764 |
| gene-RN7SK | RN7SK | NA | human | human | 104974-001-011_S1_L001 | 10855.497 |
| gene-PKNH_0935200 | gene-PKNH_0935200 | snRNA | pk | pk | 104974-001-011_S1_L001 | 10217.904 |
| gene-PKNH_1339200 | gene-PKNH_1339200 | ncRNA | pk | pk | 104974-001-011_S1_L001 | 7907.086 |
| gene-FTL | FTL | NA | human | human | 104974-001-011_S1_L001 | 6860.040 |
| gene-HBB | HBB | NA | human | human_globin | 104974-001-011_S1_L001 | 5702.090 |
| gene-ADIPOR1 | ADIPOR1 | NA | human | human | 104974-001-011_S1_L001 | 5452.125 |
| gene-UBB | UBB | NA | human | human | 104974-001-011_S1_L001 | 5000.284 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-011_S1_L001 | 4800.788 |
| gene-RNA28SN4 | RNA28SN4 | NA | human | human_rRNA | 104974-001-011_S1_L001 | 4511.897 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-011_S1_L001 | 4355.019 |
| gene-RPPH1-2 | RPPH1 | NA | human | human | 104974-001-011_S1_L001 | 3979.430 |
| gene-RPPH1 | RPPH1 | NA | human | human | 104974-001-011_S1_L001 | 3978.027 |
| gene-SLC25A39 | SLC25A39 | NA | human | human | 104974-001-011_S1_L001 | 3085.912 |
| gene-RNY1 | RNY1 | NA | human | human | 104974-001-011_S1_L001 | 2933.827 |
| gene-UBA52 | UBA52 | NA | human | human | 104974-001-011_S1_L001 | 2915.853 |
| gene-OAZ1 | OAZ1 | NA | human | human | 104974-001-011_S1_L001 | 2894.726 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-011_S1_L001 | 2852.671 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-HBB | HBB | NA | human | human_globin | 104974-001-012_S1_L001 | 366480.180 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-012_S1_L001 | 263245.083 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-012_S1_L001 | 98009.922 |
| gene-B2M | B2M | NA | human | human | 104974-001-012_S1_L001 | 6733.582 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-012_S1_L001 | 5784.927 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-012_S1_L001 | 5749.014 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-012_S1_L001 | 4593.144 |
| gene-RNA28SN4 | RNA28SN4 | NA | human | human_rRNA | 104974-001-012_S1_L001 | 4190.916 |
| gene-LOC124905315 | LOC124905315 | NA | human | human | 104974-001-012_S1_L001 | 3825.278 |
| gene-RNR1 | RNR1 | NA | human | human_rRNA | 104974-001-012_S1_L001 | 3549.196 |
| gene-TMSB4X | TMSB4X | NA | human | human | 104974-001-012_S1_L001 | 3347.079 |
| gene-RPS27 | RPS27 | NA | human | human | 104974-001-012_S1_L001 | 3045.938 |
| gene-FTL | FTL | NA | human | human | 104974-001-012_S1_L001 | 3016.675 |
| gene-ND3 | ND3 | NA | human | human | 104974-001-012_S1_L001 | 2750.724 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-012_S1_L001 | 2577.031 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-012_S1_L001 | 2375.402 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-012_S1_L001 | 2212.215 |
| gene-COX2 | COX2 | NA | human | human | 104974-001-012_S1_L001 | 2207.033 |
| gene-RPL39 | RPL39 | NA | human | human | 104974-001-012_S1_L001 | 2132.982 |
| gene-ATP8 | ATP8 | NA | human | human | 104974-001-012_S1_L001 | 1919.579 |
| gene-EEF1A1 | EEF1A1 | NA | human | human | 104974-001-012_S1_L001 | 1903.976 |
| gene-RNA28SN2 | RNA28SN2 | NA | human | human_rRNA | 104974-001-012_S1_L001 | 1875.808 |
| gene-ATP6 | ATP6 | NA | human | human | 104974-001-012_S1_L001 | 1867.009 |
| gene-RNA18SN2 | RNA18SN2 | NA | human | human_rRNA | 104974-001-012_S1_L001 | 1677.114 |
| gene-RNA18SN3 | RNA18SN3 | NA | human | human_rRNA | 104974-001-012_S1_L001 | 1671.307 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-HBB | HBB | NA | human | human_globin | 104974-001-013_S1_L001 | 388222.402 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-013_S1_L001 | 289856.854 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-013_S1_L001 | 103444.535 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-013_S1_L001 | 6019.643 |
| gene-B2M | B2M | NA | human | human | 104974-001-013_S1_L001 | 5886.942 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-013_S1_L001 | 5240.822 |
| gene-S100A8 | S100A8 | NA | human | human | 104974-001-013_S1_L001 | 4629.461 |
| gene-RNR1 | RNR1 | NA | human | human_rRNA | 104974-001-013_S1_L001 | 3265.530 |
| gene-FTL | FTL | NA | human | human | 104974-001-013_S1_L001 | 3136.281 |
| gene-TMSB4X | TMSB4X | NA | human | human | 104974-001-013_S1_L001 | 2718.495 |
| gene-RNA28SN4 | RNA28SN4 | NA | human | human_rRNA | 104974-001-013_S1_L001 | 2457.827 |
| gene-COX3 | COX3 | NA | human | human | 104974-001-013_S1_L001 | 2138.235 |
| gene-ND3 | ND3 | NA | human | human | 104974-001-013_S1_L001 | 2125.318 |
| gene-RPS27 | RPS27 | NA | human | human | 104974-001-013_S1_L001 | 2021.781 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-013_S1_L001 | 1910.314 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-013_S1_L001 | 1851.973 |
| gene-LOC124905315 | LOC124905315 | NA | human | human | 104974-001-013_S1_L001 | 1741.411 |
| gene-COX2 | COX2 | NA | human | human | 104974-001-013_S1_L001 | 1678.269 |
| gene-FTH1 | FTH1 | NA | human | human | 104974-001-013_S1_L001 | 1635.636 |
| gene-RPL39 | RPL39 | NA | human | human | 104974-001-013_S1_L001 | 1558.862 |
| gene-ATP8 | ATP8 | NA | human | human | 104974-001-013_S1_L001 | 1532.902 |
| gene-EEF1A1 | EEF1A1 | NA | human | human | 104974-001-013_S1_L001 | 1446.312 |
| gene-ATP6 | ATP6 | NA | human | human | 104974-001-013_S1_L001 | 1419.527 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-013_S1_L001 | 1331.099 |
| gene-COX1 | COX1 | NA | human | human | 104974-001-013_S1_L001 | 1240.701 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-HBB | HBB | NA | human | human_globin | 104974-001-014_S1_L001 | 523264.4084 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-014_S1_L001 | 284480.8538 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-014_S1_L001 | 114371.1571 |
| gene-LOC124905315 | LOC124905315 | NA | human | human | 104974-001-014_S1_L001 | 2918.1146 |
| gene-RNA28SN4 | RNA28SN4 | NA | human | human_rRNA | 104974-001-014_S1_L001 | 2536.0602 |
| gene-FTL | FTL | NA | human | human | 104974-001-014_S1_L001 | 1871.6873 |
| gene-RNA18SN3-2 | RNA18SN3 | NA | human | human_rRNA | 104974-001-014_S1_L001 | 1543.4707 |
| gene-RNA18SN4 | RNA18SN4 | NA | human | human_rRNA | 104974-001-014_S1_L001 | 1542.5888 |
| gene-RNA18SN3 | RNA18SN3 | NA | human | human_rRNA | 104974-001-014_S1_L001 | 1542.4997 |
| gene-RNA18SN2 | RNA18SN2 | NA | human | human_rRNA | 104974-001-014_S1_L001 | 1535.9067 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-014_S1_L001 | 1366.6827 |
| gene-RNA28SN2 | RNA28SN2 | NA | human | human_rRNA | 104974-001-014_S1_L001 | 1296.8516 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-014_S1_L001 | 1166.9362 |
| gene-UBA52 | UBA52 | NA | human | human | 104974-001-014_S1_L001 | 1068.1666 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-014_S1_L001 | 1028.5515 |
| gene-UBB | UBB | NA | human | human | 104974-001-014_S1_L001 | 892.5544 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-014_S1_L001 | 851.2235 |
| gene-HBG2 | HBG2 | NA | human | human_globin | 104974-001-014_S1_L001 | 842.1872 |
| gene-FKBP8 | FKBP8 | NA | human | human | 104974-001-014_S1_L001 | 814.0800 |
| gene-B2M | B2M | NA | human | human | 104974-001-014_S1_L001 | 783.0767 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-014_S1_L001 | 715.1521 |
| gene-RNA5-8SN4 | RNA5-8SN4 | NA | human | human_rRNA | 104974-001-014_S1_L001 | 636.0227 |
| gene-OAZ1 | OAZ1 | NA | human | human | 104974-001-014_S1_L001 | 599.3413 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-014_S1_L001 | 578.5453 |
| gene-SLC25A39 | SLC25A39 | NA | human | human | 104974-001-014_S1_L001 | 555.1666 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-HBB | HBB | NA | human | human_globin | 104974-001-015_S1_L001 | 448423.0722 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-015_S1_L001 | 316409.7173 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-015_S1_L001 | 127791.9575 |
| gene-LOC124905315 | LOC124905315 | NA | human | human | 104974-001-015_S1_L001 | 3596.4548 |
| gene-RNA28SN4 | RNA28SN4 | NA | human | human_rRNA | 104974-001-015_S1_L001 | 3411.2697 |
| gene-RNA18SN2 | RNA18SN2 | NA | human | human_rRNA | 104974-001-015_S1_L001 | 1762.2669 |
| gene-RNA18SN4 | RNA18SN4 | NA | human | human_rRNA | 104974-001-015_S1_L001 | 1756.8958 |
| gene-RNA18SN3 | RNA18SN3 | NA | human | human_rRNA | 104974-001-015_S1_L001 | 1753.9691 |
| gene-RNA18SN3-2 | RNA18SN3 | NA | human | human_rRNA | 104974-001-015_S1_L001 | 1750.1382 |
| gene-FTL | FTL | NA | human | human | 104974-001-015_S1_L001 | 1660.3870 |
| gene-RNA28SN2 | RNA28SN2 | NA | human | human_rRNA | 104974-001-015_S1_L001 | 1615.4719 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-015_S1_L001 | 1099.8989 |
| gene-UBA52 | UBA52 | NA | human | human | 104974-001-015_S1_L001 | 1020.6326 |
| gene-PKNH_0418600 | gene-PKNH_0418600 | protein_coding | pk | pk | 104974-001-015_S1_L001 | 925.7916 |
| gene-FKBP8 | FKBP8 | NA | human | human | 104974-001-015_S1_L001 | 917.5296 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-015_S1_L001 | 858.6056 |
| gene-B2M | B2M | NA | human | human | 104974-001-015_S1_L001 | 792.5917 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-015_S1_L001 | 779.4010 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-015_S1_L001 | 768.8011 |
| gene-UBB | UBB | NA | human | human | 104974-001-015_S1_L001 | 734.8697 |
| gene-S100A9 | S100A9 | NA | human | human | 104974-001-015_S1_L001 | 692.6973 |
| gene-HBG2 | HBG2 | NA | human | human_globin | 104974-001-015_S1_L001 | 663.0136 |
| gene-OAZ1 | OAZ1 | NA | human | human | 104974-001-015_S1_L001 | 576.5540 |
| gene-SLC25A39 | SLC25A39 | NA | human | human | 104974-001-015_S1_L001 | 567.3799 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-015_S1_L001 | 566.2701 |
| gene_id | gene_name | gene_biotype | species | gene_type | sample | counts |
|---|---|---|---|---|---|---|
| gene-HBB | HBB | NA | human | human_globin | 104974-001-016_S1_L001 | 514171.8546 |
| gene-HBA2 | HBA2 | NA | human | human_globin | 104974-001-016_S1_L001 | 301490.6876 |
| gene-HBA1 | HBA1 | NA | human | human_globin | 104974-001-016_S1_L001 | 122126.6477 |
| gene-LOC124905315 | LOC124905315 | NA | human | human | 104974-001-016_S1_L001 | 3512.6965 |
| gene-RNA28SN4 | RNA28SN4 | NA | human | human_rRNA | 104974-001-016_S1_L001 | 3033.8382 |
| gene-RNA18SN2 | RNA18SN2 | NA | human | human_rRNA | 104974-001-016_S1_L001 | 2096.6635 |
| gene-RNA18SN3 | RNA18SN3 | NA | human | human_rRNA | 104974-001-016_S1_L001 | 2086.6004 |
| gene-RNA18SN4 | RNA18SN4 | NA | human | human_rRNA | 104974-001-016_S1_L001 | 2073.9575 |
| gene-RNA18SN3-2 | RNA18SN3 | NA | human | human_rRNA | 104974-001-016_S1_L001 | 2066.1062 |
| gene-FTL | FTL | NA | human | human | 104974-001-016_S1_L001 | 1780.1712 |
| gene-RNA28SN2 | RNA28SN2 | NA | human | human_rRNA | 104974-001-016_S1_L001 | 1534.3552 |
| gene-RPS12 | RPS12 | NA | human | human | 104974-001-016_S1_L001 | 1288.8221 |
| gene-UBA52 | UBA52 | NA | human | human | 104974-001-016_S1_L001 | 1087.6524 |
| gene-RPL21 | RPL21 | NA | human | human | 104974-001-016_S1_L001 | 990.7414 |
| gene-RNR2 | RNR2 | NA | human | human_rRNA | 104974-001-016_S1_L001 | 990.5111 |
| gene-UBB | UBB | NA | human | human | 104974-001-016_S1_L001 | 922.4129 |
| gene-HBG2 | HBG2 | NA | human | human_globin | 104974-001-016_S1_L001 | 911.4972 |
| gene-GYPC | GYPC | NA | human | human | 104974-001-016_S1_L001 | 883.1664 |
| gene-FKBP8 | FKBP8 | NA | human | human | 104974-001-016_S1_L001 | 818.6246 |
| gene-OAZ1 | OAZ1 | NA | human | human | 104974-001-016_S1_L001 | 600.4898 |
| gene-RPL41 | RPL41 | NA | human | human | 104974-001-016_S1_L001 | 559.7779 |
| gene-SLC25A39 | SLC25A39 | NA | human | human | 104974-001-016_S1_L001 | 555.5004 |
| gene-RNA5-8SN4 | RNA5-8SN4 | NA | human | human_rRNA | 104974-001-016_S1_L001 | 525.1435 |
| gene-RN7SL2 | RN7SL2 | NA | human | human | 104974-001-016_S1_L001 | 410.6589 |
| gene-PKNH_0418600 | gene-PKNH_0418600 | protein_coding | pk | pk | 104974-001-016_S1_L001 | 380.8039 |
Also see figure in results/fastq-screen/ and results/multiqc-fastq-screen-samtools-pk/multiqc_report.html.
fastq_screen_transformed <- bind_rows(
fastq_screen %>%
mutate(`%_single_genome` = `%_One_hit_one_genome` + `%_Multiple_hits_one_genome`) %>%
mutate(`%_multiple_genomes` = `%_One_hit_multiple_genomes` + `%_Multiple_hits_multiple_genomes`) %>%
mutate(`No_hits` = `%_One_hit_multiple_genomes` + `%_Multiple_hits_multiple_genomes`) %>%
select(c(sample, Genome, `#Reads_processed`, `%_single_genome`, `%_multiple_genomes`)) %>%
pivot_wider(names_from = Genome, values_from = c(`%_single_genome`, `%_multiple_genomes`)) %>%
select(!`%_multiple_genomes_PknowlesiH`) %>%
rename(`%_multiple_genomes` = `%_multiple_genomes_Human`) %>%
mutate(pct = 1 - `%_single_genome_PknowlesiH` - `%_single_genome_Human` - `%_multiple_genomes`) %>%
select(sample, pct) %>%
mutate("Mapped reads" = "No hits"),
fastq_screen %>%
mutate(pct = `%_One_hit_one_genome` + `%_Multiple_hits_one_genome`) %>%
select(c(sample, Genome, pct)) %>%
rename("Mapped reads" = Genome),
fastq_screen %>%
mutate(pct = `%_One_hit_multiple_genomes` + `%_Multiple_hits_multiple_genomes`) %>%
select(c(sample, pct)) %>%
distinct(sample, .keep_all = TRUE) %>%
mutate("Mapped reads" = "Multiple genomes")
) %>%
# change order of levels here to adjust colours
mutate(`Mapped reads` = factor(`Mapped reads`,
levels = c("PknowlesiH", "Human", "Multiple genomes", "No hits"),
labels = c("P. knowlesi", "Human", "Multiple hits", "No hits")
)) %>%
left_join(samples, by = "sample")
ggplot(fastq_screen_transformed, aes(x = sample_short, y = pct, fill = `Mapped reads`)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T, limits = rev) +
coord_flip() +
scale_x_discrete(limits = rev) +
labs(fill = expression("FastQ Screen mapped reads"), x = "Sample") +
ylab(label = "% of mapped reads")
# Human reads
fastq_screen %>%
pivot_longer(
cols = starts_with("%"),
names_to = "fastq_screen_pct_type",
values_to = "fastq_screen_pct_value"
) %>%
filter(Genome == "Human") %>%
left_join(samples) %>%
ggplot(aes(x = sample_short, y = fastq_screen_pct_value, fill = fastq_screen_pct_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
coord_flip() +
scale_x_discrete(limits = rev) +
labs(fill = "FastQ screen statistics for human reads", x = "Sample", y = "% of mapped reads")
## Joining with `by = join_by(sample)`
# Pk reads
fastq_screen %>%
pivot_longer(
cols = starts_with("%"),
names_to = "fastq_screen_pct_type",
values_to = "fastq_screen_pct_value"
) %>%
filter(Genome == "PknowlesiH") %>%
left_join(samples) %>%
ggplot(aes(x = sample_short, y = fastq_screen_pct_value, fill = fastq_screen_pct_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
coord_flip() +
scale_x_discrete(limits = rev) +
labs(fill = expression("FastQ screen statistics for" ~ italic("P. knowlesi") ~ "reads"), x = "Sample", y = "% of mapped reads")
## Joining with `by = join_by(sample)`
gene_annotation %>%
# select rRNA and globins
filter(!gene_type %in% c("human", "pk")) %>%
left_join(gene_tpm_filtered[1:18], by = c("gene_id", "gene_name")) %>%
filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
pivot_longer(
cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
names_to = "sample",
values_to = "counts"
) %>%
left_join(samples, by = "sample") %>%
group_by(gene_id) %>%
mutate(total = sum(counts)) %>%
ungroup() %>%
arrange(species, gene_type, desc(total)) %>%
mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
ggplot(aes(x = gene_name, y = counts, fill = sample_short)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(option = "viridis", discrete = T) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
coord_flip() +
scale_x_discrete(limits = rev) +
labs(fill = "Sample") +
ylab(label = "TPM") +
xlab(label = "Gene name") +
facet_wrap(~kit_name) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale()))
Log TPM is often plotted instead of raw values, but can be misleading on barplot…
gene_annotation %>%
filter(!gene_type %in% c("human", "pk")) %>%
left_join(gene_tpm_filtered[1:18], by = c("gene_id", "gene_name")) %>%
filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
pivot_longer(
cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
names_to = "sample",
values_to = "counts"
) %>%
left_join(samples, by = "sample") %>%
group_by(gene_id) %>%
mutate(total = sum(counts)) %>%
ungroup() %>%
arrange(species, gene_type, desc(total)) %>%
mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
ggplot(aes(x = gene_name, y = counts)) +
geom_point(aes(colour = sample_name)) +
scale_colour_viridis(option = "viridis", discrete = T) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
coord_flip() +
scale_x_discrete(limits = rev) +
# labs(colour = "Sample") +
labs(x = "Gene name", y = "log(TPM+1)", colour = "Sample") +
facet_wrap(~sample_name) +
scale_y_continuous(trans = "log1p")
gene_annotation %>%
filter(gene_type == "human_globin") %>%
left_join(gene_tpm_filtered[1:18], by = c("gene_id", "gene_name")) %>%
filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
pivot_longer(
cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
names_to = "sample",
values_to = "counts"
) %>%
left_join(samples, by = "sample") %>%
group_by(gene_id) %>%
mutate(total = sum(counts)) %>%
ungroup() %>%
arrange(species, gene_type, desc(total)) %>%
mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
ggplot(aes(x = gene_name, y = counts)) +
geom_point(aes(colour = sample)) +
scale_colour_viridis(option = "viridis", discrete = T) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
coord_flip() +
scale_x_discrete(limits = rev) +
labs(x = "Gene name", y = "TPM", colour = "Sample") +
facet_wrap(~sample_name) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale()))
gene_annotation %>%
filter(gene_type == "human_rRNA") %>%
left_join(gene_tpm_filtered[1:18], by = c("gene_id", "gene_name")) %>%
filter(rowSums(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`)) != 0) %>%
pivot_longer(
cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`,
names_to = "sample",
values_to = "counts"
) %>%
left_join(samples, by = "sample") %>%
group_by(gene_id) %>%
mutate(total = sum(counts)) %>%
ungroup() %>%
arrange(species, gene_type, desc(total)) %>%
mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
ggplot(aes(x = gene_name, y = counts)) +
geom_point(aes(colour = sample)) +
scale_colour_viridis(option = "viridis", discrete = T) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1)) +
coord_flip() +
scale_x_discrete(limits = rev) +
labs(x = "Gene name", y = "TPM", colour = "Sample") +
facet_wrap(~sample_name) +
scale_y_continuous(trans = "log1p")
TPM values appear to be the most appropriate metric to use.
These gene counts were aggregated across all samples (tximport of transcript counts) and filtered to only genes with a minimum count of 10 reads for at least 1 sample.
gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev,
labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Sum of gene counts", fill = "RNA type")
Filtered to only genes with a minimum count of 10 reads for at least 1 sample.
gene_tpm_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev,
labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Sum of TPM counts", fill = "RNA type")
All gene biotypes:
gene_tpm_filtered_long %>%
left_join(samples, by = "sample") %>%
mutate(gene_type = case_when(species == "pk" ~ gene_biotype, TRUE ~ gene_type)) %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
# scale_fill_viridis(
# option = "viridis", discrete = T,
# limits = rev,
# labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
# ) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Sum of TPM counts", fill = "RNA type")
These gene counts were aggregated across all samples (tximport of transcript counts) and filtered to only genes with a minimum count of 10 reads for at least 1 sample.
# absolute numbers
gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, species) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = species)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Sum of gene counts", fill = "Species")
# proportion
gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, species) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = species)) +
geom_bar(position = "fill", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Proportion of gene counts", fill = "Species")
Note that since TPM is already scaled by library size, the absolute and proportion plots are identical.
# absolute numbers
gene_tpm_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, species) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = species)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Sum of TPM counts", fill = "Species")
# proportion
gene_tpm_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, species) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = species)) +
geom_bar(position = "fill", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Proportion of TPM counts", fill = "Species")
# create binary matrices with presence/absence of genes
gene_tpm_filtered_binary <- gene_tpm_filtered %>%
mutate(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`, .fns = ~ .x > 0)) %>%
select(!c(gene_name, gene_type, species))
gene_tpm_pk_filtered_binary <- gene_tpm_filtered %>%
mutate(across(.cols = `104974-001-001_S1_L001`:`104974-001-016_S1_L001`, .fns = ~ .x > 0)) %>%
filter(species == "pk") %>%
select(!c(gene_name, gene_type, species))
gene_tpm_filtered_binary_list <- purrr::set_names(colnames(gene_tpm_filtered_binary %>% select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`))) %>%
purrr::imap(~ gene_tpm_filtered_binary$gene_id[gene_tpm_filtered_binary[, .x, drop = T]])
gene_tpm_pk_filtered_binary_list <- purrr::set_names(colnames(gene_tpm_pk_filtered_binary %>% select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`))) %>%
purrr::imap(~ gene_tpm_pk_filtered_binary$gene_id[gene_tpm_pk_filtered_binary[, .x, drop = T]])
comparison_centpip <- samples %>%
filter(WBC_depletion == "centpip") %>%
select(sample)
comparison_plasmo <- samples %>%
filter(WBC_depletion == "plasmo") %>%
select(sample)
comparison_pmacs <- samples %>%
filter(WBC_depletion == "pmacs") %>%
select(sample)
comparison_cellulose <- samples %>%
filter(WBC_depletion == "cellulose") %>%
select(sample)
comparison_nofilter <- samples %>%
filter(WBC_depletion == "nofilter") %>%
filter(!sample == "104974-001-006_S1_L001")
comparison_mrna_qiagen <- samples %>%
filter(kit == "mRNA_qiagen_FSglob")
comparison_trna_qiagen <- samples %>%
filter(kit == "tRNA_qiagen_FSglobH")
comparison_mrna_illumina <- samples %>%
filter(kit == "mRNA_illumina")
draw_venn <- function(set_list, names, title, output) {
VennDiagram::venn.diagram(
x = set_list,
category.names = names,
# Circles
lwd = 2,
# lty = "blank",
# col = palette.colors(3),
# fill = palette.colors(3),
# col = hcl.colors(3, palette = "viridis", alpha = 0.9),
# fill = hcl.colors(3, palette = "viridis", alpha = 0.4),
# col = hcl.colors(3, palette = "Set 2", alpha = 0.9),
# fill = hcl.colors(3, palette = "Set 2", alpha = 0.4),
col = hcl.colors(3, palette = "viridis", alpha = 0.9),
fill = hcl.colors(3, palette = "viridis", alpha = 0.4),
# Numbers
cex = .9,
fontface = "bold",
fontfamily = "sans",
# Set names
cat.cex = 0.9,
cat.fontface = "bold",
cat.default.pos = "outer",
cat.pos = c(-20, 20, 135),
cat.dist = c(0.055, 0.055, 0.11),
cat.fontfamily = "sans",
rotation = 1,
# title
main = title,
main.cex = 1.1,
main.fontface = "bold",
main.fontfamily = "sans",
# output features
height = 1200,
width = 1200,
resolution = 300,
compression = "lzw",
filename = output,
output = TRUE,
disable.logging = TRUE
)
}
Gene counts were filtered on a minimum (raw) count of 10 for at least 1 sample before checking for presence/absence.
v <- draw_venn(
gene_tpm_filtered_binary_list[comparison_centpip$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"Centpip",
NULL
# here("results/figures/venn-centpip.png")
)
grid.newpage()
grid.draw(v)
v <- draw_venn(
gene_tpm_filtered_binary_list[comparison_plasmo$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"Plasmo",
NULL
# here("results/figures/venn-plasmo.png")
)
grid.newpage()
grid.draw(v)
v <- draw_venn(
gene_tpm_filtered_binary_list[comparison_pmacs$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"PMACS",
NULL
# here("results/figures/venn-pmacs.png")
)
grid.newpage()
grid.draw(v)
v <- draw_venn(
gene_tpm_filtered_binary_list[comparison_cellulose$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"Cellulose",
NULL
# here("results/figures/venn-cellulose.png")
)
grid.newpage()
grid.draw(v)
v <- draw_venn(
gene_tpm_filtered_binary_list[comparison_nofilter$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"Cellulose",
NULL
# here("results/figures/venn-nofilter.png")
)
grid.newpage()
grid.draw(v)
Gene counts were filtered on a minimum (raw) count of 10 for at least 1 sample before checking for presence/absence.
v <- draw_venn(
gene_tpm_pk_filtered_binary_list[comparison_centpip$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"Centpip",
NULL
# here("results/figures/venn-centpip_pk.png")
)
grid.newpage()
grid.draw(v)
v <- draw_venn(
gene_tpm_pk_filtered_binary_list[comparison_plasmo$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"Plasmo",
NULL
# here("results/figures/venn-plasmo_pk.png")
)
grid.newpage()
grid.draw(v)
v <- draw_venn(
gene_tpm_pk_filtered_binary_list[comparison_pmacs$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"PMACS",
NULL
# here("results/figures/venn-pmacs_pk.png")
)
grid.newpage()
grid.draw(v)
v <- draw_venn(
gene_tpm_pk_filtered_binary_list[comparison_cellulose$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"Cellulose",
NULL
# here("results/figures/venn-cellulose_pk.png")
)
grid.newpage()
grid.draw(v)
v <- draw_venn(
gene_tpm_pk_filtered_binary_list[comparison_nofilter$sample],
c("mRNA Qiagen FS globin", "tRNA Qiagen FS globin/rRNA", "mRNA Illumina"),
"No filter",
NULL
# here("results/figures/venn-nofilter_pk.png")
)
grid.newpage()
grid.draw(v)
# Use upset plots instead of five-way venn diagram (alternatively check out proportional area venn)
# mRNA Qiagen
UpSetR::upset(
as.data.frame(gene_tpm_filtered_binary %>%
select((comparison_mrna_qiagen %>% select(sample) %>% pull())) %>%
rename(all_of(c(
"no filter" = "104974-001-001_S1_L001",
"cent/pip" = "104974-001-002_S1_L001",
"plasmodipur" = "104974-001-003_S1_L001",
"PMACS" = "104974-001-004_S1_L001",
"cellulose" = "104974-001-005_S1_L001"
))) %>%
mutate_all(as.integer)),
keep.order = TRUE,
sets.x.label = "Genes per filter",
mainbar.y.label = "Intersections for mRNA Qiagen",
order.by = "freq"
)
# tRNA Qiagen
UpSetR::upset(
as.data.frame(gene_tpm_filtered_binary %>%
select((comparison_trna_qiagen %>% select(sample) %>% pull())) %>%
rename(all_of(c(
"no filter" = "104974-001-007_S1_L001",
"cent/pip" = "104974-001-008_S1_L001",
"plasmodipur" = "104974-001-009_S1_L001",
"PMACS" = "104974-001-010_S1_L001",
"cellulose" = "104974-001-011_S1_L001"
))) %>%
mutate_all(as.integer)),
keep.order = TRUE,
sets.x.label = "Genes per filter",
mainbar.y.label = "Intersections for tRNA Qiagen",
order.by = "freq"
)
# mRNA Illumina
UpSetR::upset(
as.data.frame(gene_tpm_filtered_binary %>%
select((comparison_mrna_illumina %>% select(sample) %>% pull())) %>%
rename(all_of(c(
"no filter" = "104974-001-012_S1_L001",
"cent/pip" = "104974-001-013_S1_L001",
"plasmodipur" = "104974-001-014_S1_L001",
"PMACS" = "104974-001-015_S1_L001",
"cellulose" = "104974-001-016_S1_L001"
))) %>%
mutate_all(as.integer)),
keep.order = TRUE,
sets.x.label = "Genes per filter",
mainbar.y.label = "Intersections for mRNA Illumina",
order.by = "freq"
)
# mRNA Qiagen
UpSetR::upset(
as.data.frame(gene_tpm_pk_filtered_binary %>%
select((comparison_mrna_qiagen %>% select(sample) %>% pull())) %>%
rename(all_of(c(
"no filter" = "104974-001-001_S1_L001",
"cent/pip" = "104974-001-002_S1_L001",
"plasmodipur" = "104974-001-003_S1_L001",
"PMACS" = "104974-001-004_S1_L001",
"cellulose" = "104974-001-005_S1_L001"
))) %>%
mutate_all(as.integer)),
keep.order = TRUE,
sets.x.label = "Genes per filter",
mainbar.y.label = "Intersections for mRNA Qiagen",
order.by = "freq"
)
# tRNA Qiagen
UpSetR::upset(
as.data.frame(gene_tpm_pk_filtered_binary %>%
select((comparison_trna_qiagen %>% select(sample) %>% pull())) %>%
rename(all_of(c(
"no filter" = "104974-001-007_S1_L001",
"cent/pip" = "104974-001-008_S1_L001",
"plasmodipur" = "104974-001-009_S1_L001",
"PMACS" = "104974-001-010_S1_L001",
"cellulose" = "104974-001-011_S1_L001"
))) %>%
mutate_all(as.integer)),
keep.order = TRUE,
sets.x.label = "Genes per filter",
mainbar.y.label = "Intersections for tRNA Qiagen",
order.by = "freq"
)
# mRNA Illumina
UpSetR::upset(
as.data.frame(gene_tpm_pk_filtered_binary %>%
select((comparison_mrna_illumina %>% select(sample) %>% pull())) %>%
rename(all_of(c(
"no filter" = "104974-001-012_S1_L001",
"cent/pip" = "104974-001-013_S1_L001",
"plasmodipur" = "104974-001-014_S1_L001",
"PMACS" = "104974-001-015_S1_L001",
"cellulose" = "104974-001-016_S1_L001"
))) %>%
mutate_all(as.integer)),
keep.order = TRUE,
sets.x.label = "Genes per filter",
mainbar.y.label = "Intersections for mRNA Illumina",
order.by = "freq"
)
Which samples are most similar to each other in terms of gene
expression profile? Use Euclidean distance between samples on
rlog transformed gene counts (filtered to genes with a
minimum of 10 reads per sample).
heatmap_of_distances <- function(transformed_data, title) {
sampleDists <- dist(t(assay(transformed_data)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- transformed_data$sample_short
colnames(sampleDistMatrix) <- NULL
# add grouping columns
anno <- as.data.frame(colData(transformed_data)[, c("kit_name", "filter_name")]) %>%
mutate(kit_name = as.factor(kit_name)) %>%
mutate(filter_name = as.factor(filter_name)) %>%
rename_with(~ str_to_title(str_replace(.x, "_", " ")))
# set names of samples
colnames(sampleDistMatrix) <- (transformed_data)$sample_short
rownames(anno) <- (transformed_data)$sample_short
# Adapted from: https://slowkow.com/notes/pheatmap-tutorial/
mat_colors <- list(
`Kit Name` = hcl.colors(length(unique(anno$`Kit Name`)), palette = "plasma", alpha = 1),
`Filter Name` = palette.colors(length(unique(anno$`Filter Name`)), palette = "Okabe-Ito", alpha = 1)
)
names(mat_colors$`Kit Name`) <- unique(anno$`Kit Name`)
names(mat_colors$`Filter Name`) <- unique(anno$`Filter Name`)
# create heatmap
p <- pheatmap(sampleDistMatrix,
annotation_col = anno,
annotation_colors = mat_colors,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
color = viridis(
n = 256, alpha = 1,
begin = 0, end = 1, option = "viridis"
),
# main = "Sample-to-sample Euclidian distance based on all genes",
main = title,
scale = "none",
show_rownames = TRUE,
show_colnames=FALSE
) +
theme(plot.title = element_markdown())
return(p)
}
heatmap_of_distances(rld_filtered, str_wrap("Sample-to-sample Euclidian distance based on all genes - rlog", 40))
## NULL
The VST transform leads to slightly different results: the two large blocks remains the same, but the positioning of samples 5, 11 and 16 differs. Overall, since the samples are so similar overall, slight deviations in the clustering of the distance matrix are not too surprising.
heatmap_of_distances(vsd_filtered, str_wrap("Sample-to-sample Euclidian distance based on all genes - vst", 40))
## NULL
heatmap_of_distances(rld_filtered[rowData(rld_filtered)$species == "pk", ], str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* genes - rlog before subset", 40))
## NULL
heatmap_of_distances(vsd_filtered[rowData(vsd_filtered)$species == "pk", ], str_wrap("Sample-to-sample Euclidian distance based on P. knowlesi genes - vst before subset", 40))
## NULL
heatmap_of_distances(rld_pk_filtered, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* genes - rlog after subset", 40))
## NULL
heatmap_of_distances(vsd_pk_filtered, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* genes - vst after subset", 40))
## NULL
heatmap_of_distances(rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding", ], str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi** protein-coding genes", 40))
## NULL
heatmap_of_distances(vsd_filtered[rowData(vsd_filtered)$species == "pk" & rowData(vsd_filtered)$gene_biotype == "protein_coding", ], str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* protein-coding genes", 40))
## NULL
heatmap_of_distances(rld_pk_filtered_coding, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* protein-coding genes - rlog after subset", 40))
## NULL
heatmap_of_distances(vsd_pk_filtered_coding, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* protein-coding genes - vst after subset", 40))
## NULL
heatmap_of_counts <- function(raw_data, transformed_data, top_type = "mean", top_n = 20, center = TRUE) {
# select n most highly expressed/variable genes
if (top_type == "mean") {
select <- order(rowMeans(counts(raw_data, normalized = FALSE)),
decreasing = TRUE
)[1:top_n]
} else if (top_type == "median") {
select <- order(rowMedians(counts(raw_data, normalized = FALSE)),
decreasing = TRUE
)[1:top_n]
} else if (top_type == "var") {
select <- head(order(rowVars(assay(transformed_data)), decreasing = TRUE), top_n)
}
# select transformed data and center if requested
mat <- assay(transformed_data)[select, ]
if (center == TRUE){
mat <- mat - rowMeans(mat)
}
# add grouping columns
anno <- as.data.frame(colData(transformed_data)[, c("kit_name", "filter_name")]) %>%
mutate(kit_name = as.factor(kit_name)) %>%
mutate(filter_name = as.factor(filter_name)) %>%
rename_with(~ str_to_title(str_replace(.x, "_", " ")))
# set names of samples
colnames(mat) <- (transformed_data)$sample_short
rownames(anno) <- (transformed_data)$sample_short
# Adapted from: https://slowkow.com/notes/pheatmap-tutorial/
mat_colors <- list(
`Kit Name` = hcl.colors(length(unique(anno$`Kit Name`)), palette = "plasma", alpha = 1),
`Filter Name` = palette.colors(length(unique(anno$`Filter Name`)), palette = "Okabe-Ito", alpha = 1)
)
names(mat_colors$`Kit Name`) <- unique(anno$`Kit Name`)
names(mat_colors$`Filter Name`) <- unique(anno$`Filter Name`)
# create heatmap
pheatmap(mat,
annotation_col = anno,
annotation_colors = mat_colors,
color = viridis(n = 256, alpha = 1, begin = 0, end = 1, option = "viridis"),
# cluster_rows = FALSE,
# cluster_cols = FALSE,
show_rownames = FALSE,
)
# create table of genes
gene_annotation %>% filter(gene_id %in% rownames(mat))
}
This plot clusters the top n=20 most highly expressed genes (mean) centred on the gene’s mean expression level across all samples using the VST data.
Several of the top expressed genes are globin and rRNA genes (and most reads are human).
heatmap_of_counts(gse_filtered, vsd_filtered, "mean", 20, TRUE)
This plot cluster the top n=20 most highly expressed genes (mean) centred on the gene’s mean expression level across all samples using the rlog transformed data.
Several of the top expressed genes are globin and rRNA genes (and most reads are human).
heatmap_of_counts(gse_filtered, rld_filtered, "mean", 20, TRUE)
This plot clusters the top n=20 most highly expressed genes (median) centred on the gene’s mean expression level across all samples using the VST data.
Several of the top expressed genes are globin and rRNA genes (and most reads are human).
heatmap_of_counts(gse_filtered, vsd_filtered, "median", 20, TRUE)
This plot cluster the top n=20 most highly expressed genes (median) centred on the gene’s mean expression level across all samples using the rlog transformed data.
Several of the top expressed genes are globin and rRNA genes (and most reads are human).
heatmap_of_counts(gse_filtered, rld_filtered, "median", 20, TRUE)
This plot clusters the top n=20 most variable genes centred on the gene’s mean expression level across all samples using the VST data.
Several of the top variable genes are human/Pk rRNA genes (and most reads are human).
heatmap_of_counts(gse_filtered, vsd_filtered, "var", 20, TRUE)
This plot cluster the top n=20 most variable genes centred on the gene’s mean expression level across all samples using the rlog transformed data.
Several of the top variable genes are human genes (but globin and rRNA are rare).
heatmap_of_counts(gse_filtered, rld_filtered, "var", 20, TRUE)
This plot clusters the top n=20 most highly expressed genes centred on the gene’s mean expression level across all samples using the VST data.
The choice of transformation only has a minor effect on this subset of top-expressed genes.
heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
vsd_filtered[rowData(vsd_filtered)$species == "pk" & rowData(vsd_filtered)$gene_biotype == "protein_coding"],
"mean", 20, TRUE)
This plot cluster the top n=20 most highly expressed genes centred on the gene’s mean expression level across all samples using the rlog transformed data.
The choice of transformation only has a minor effect on this subset of top-expressed genes.
heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"],
"mean", 20, TRUE)
This plot clusters the top n=20 most variable genes centred on the gene’s mean expression level across all samples using the VST data.
The choice of transformation only has a minor effect on this subset of top-variable genes.
heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
vsd_filtered[rowData(vsd_filtered)$species == "pk" & rowData(vsd_filtered)$gene_biotype == "protein_coding"],
"var", 20, TRUE)
This plot cluster the top n=20 most variable genes centred on the gene’s mean expression level across all samples using the rlog transformed data.
The choice of transformation only has a minor effect on this subset of top-variable genes.
heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"],
"var", 20, TRUE)
This plot cluster the top n=20 most highly expressed Pk protein coding genes (centred on the gene’s mean expression level) across all samples using the vst transformed data, without clustering. The data was transformed after subsetting.
heatmap_of_counts(gse_pk_filtered_coding,
vsd_pk_filtered_coding,
"mean", 20, TRUE)
This plot cluster the top n=20 most highly expressed Pk protein coding genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.
heatmap_of_counts(gse_pk_filtered_coding,
rld_pk_filtered_coding,
"mean", 20, TRUE)
This plot cluster the top n=20 most highly variable Pk protein coding genes across all samples using the vst transformed data, without clustering. The data was transformed after subsetting.
heatmap_of_counts(gse_pk_filtered_coding,
vsd_pk_filtered_coding,
"var", 20, TRUE)
This plot cluster the top n=20 most highly variable Pk protein coding genes across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.
heatmap_of_counts(gse_pk_filtered_coding,
rld_pk_filtered_coding,
"var", 20, TRUE)
This plot cluster the top n=20 most highly expressed Pk genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering.
Several of the top expressed genes in the previous plot are rRNA genes, but not all of them.
heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
vsd_filtered[rowData(gse_filtered)$species == "pk"],
"mean", 20, TRUE)
This plot cluster the top n=20 most highly expressed Pk genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering.
heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
vsd_filtered[rowData(gse_filtered)$species == "pk"],
"mean", 20, TRUE)
This plot cluster the top n=20 most highly variable Pk genes across all samples using the rlog transformed data, without clustering.
heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
vsd_filtered[rowData(gse_filtered)$species == "pk"],
"var", 20, TRUE)
This plot cluster the top n=20 most highly variable Pk genes across all samples using the rlog transformed data, without clustering.
heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
vsd_filtered[rowData(gse_filtered)$species == "pk"],
"var", 20, TRUE)
This plot cluster the top n=20 most highly expressed Pk genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.
Several of the top expressed genes in the previous plot are rRNA genes, but not all of them.
heatmap_of_counts(gse_pk_filtered,
vsd_pk_filtered,
"mean", 20, TRUE)
This plot cluster the top n=20 most highly expressed Pk genes (centred on the gene’s mean expression level) across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.
Several of the top expressed genes in the previous plot are rRNA genes, but not all of them.
heatmap_of_counts(gse_pk_filtered,
rld_pk_filtered,
"mean", 20, TRUE)
This plot cluster the top n=20 most highly variable Pk genes across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.
heatmap_of_counts(gse_pk_filtered,
vsd_pk_filtered,
"var", 20, TRUE)
This plot cluster the top n=20 most highly variable Pk genes across all samples using the rlog transformed data, without clustering. The data was transformed after subsetting.
heatmap_of_counts(gse_pk_filtered,
rld_pk_filtered,
"var", 20, TRUE)
The following figure plots the (log-transformed) TPM of each protein-coding P. knowlesi gene (filtered on a minimum count of 10 for at least one sample) for all pairwise combinations of samples.
mat <- round(
cor(
gene_tpm_filtered %>%
filter(species == "pk") %>%
filter(gene_biotype == "protein_coding") %>%
select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`) %>%
rename_with(~samples %>% pull(sample_name) %>% as.character(), everything())
),
1)
corr <- mat
ggcorrplot(corr,
type = "lower",
method = "circle",
hc.order = FALSE,
outline.color = "white",
p.mat = cor_pmat(mat),
lab = TRUE,
legend.title = "Pearson correlation of TPM values") +
scale_fill_gradientn(colors = viridis(256, option = 'D'), limits = c(-1,1),
name = "Pearson correlation")
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
correlation_plot <- ggpairs(
gene_tpm_filtered %>%
filter(species == "pk") %>%
filter(gene_biotype == "protein_coding"),
columns = 3:18,
columnLabels = samples$sample_short,
upper = list(continuous = wrap("cor", size = 5)),
) +
theme(
axis.text = element_text(size = 9),
axis.title = element_text(size = 9),
strip.text.x = element_text(size = 11),
strip.text.y = element_text(size = 10),
axis.text.x = element_text(size = 6),
axis.text.y = element_text(size = 9)
)
# scale_y_discrete()
correlation_plot
correlation_plot_log <- ggpairs(
gene_tpm_filtered %>%
filter(species == "pk") %>%
filter(gene_biotype == "protein_coding"),
columns = 3:18, # `104974-001-001_S1_L001`:`104974-001-002_S1_L001`,
lower = list(continuous = "smooth"),
upper = list(continuous = wrap("cor", size = 5)),
columnLabels = samples$sample_short
) +
theme(
axis.text = element_text(size = 9),
axis.title = element_text(size = 9),
strip.text.x = element_text(size = 11),
strip.text.y = element_text(size = 10),
axis.text.x = element_text(size = 6),
axis.text.y = element_text(size = 9)
) +
scale_x_continuous(trans = "log1p") +
scale_y_continuous(trans = "log1p")
correlation_plot_log
All PCA plots use the top 500 most variable genes and are filtered to only genes with a minimum count of 10 reads for at least one sample, unless otherwise stated.
pcaData <- plotPCA(rld_filtered, intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of rlog-transformed counts of human and ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Filter method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(vsd_filtered, intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of VST counts of human and ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Filter method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(rld_filtered[rowData(rld_filtered)$species == "pk", ], intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of rlog-transformed counts of ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(vsd_filtered[rowData(vsd_filtered)$species == "pk", ], intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"], intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of rlog-transformed counts of ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(vsd_filtered[rowData(vsd_filtered)$species == "pk" & rowData(vsd_filtered)$gene_biotype == "protein_coding"], intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(rld_pk_filtered, intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of rlog-transformed counts of ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(vsd_pk_filtered, intgroup = c("kit_name", "filter_name"), returnData = TRUE)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
protein_gene_filter <- intersect(rownames(rowData(rld_filtered)), gene_annotation %>% filter(gene_biotype == "protein_coding" & species == "pk") %>% select(gene_id) %>% pull())
pcaData <- plotPCA(rld_filtered[protein_gene_filter, ], intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of rlog-transformed counts of ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
protein_gene_filter <- intersect(rownames(rowData(vsd_filtered)), gene_annotation %>% filter(gene_biotype == "protein_coding" & species == "pk") %>% select(gene_id) %>% pull())
pcaData <- plotPCA(vsd_filtered[protein_gene_filter, ], intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(rld_pk_filtered_coding, intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of rlog counts of ", italic("P. knowlesi"), " protein coding genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
pcaData <- plotPCA(vsd_pk_filtered_coding, intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
## using ntop=500 top features by variance
percentVar <- round(100 * attr(pcaData, "percentVar"))
ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of VST counts of ", italic("P. knowlesi"), " protein coding genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
samples_order <- samples %>%
arrange(factor(samples$kit, levels = c("mRNA_illumina", "mRNA_qiagen_FSglobH", "mRNA_qiagen_FSglob", "tRNA_qiagen_FSglobH")),
factor(samples$WBC_depletion, levels = c("nofilter", "centpip", "plasmo", "pmacs", "cellulose"))) %>%
select(sample_name)
p2_a <- gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, species) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
# arrange(factor(sample_name, levels = samples_order %>% pull())) %>%
mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>%
ggplot(aes(x = sum_reads, y = sample_name, fill = species)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev,
# labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
scale_y_discrete(limits = rev) +
scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(y = "Sample", x = "Sum of all gene counts", fill = "RNA type") +
theme_classic() +
ggtitle("A")
# print p2a to show y labels
p2_a
p2_a <- p2_a + theme(axis.title.y = element_blank(), axis.text.y = element_blank())
p2_b <- gene_tpm_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>%
ggplot(aes(x = sum_reads, y = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_manual(
values = c(`human` = "#440154FF",
`human_globin` = "#3B528BFF",
`human_rRNA` = "#21908CFF",
`pk_rRNA` = "#5DC863FF",
`pk` = "#FDE725FF"),
labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")),
expression(italic("P. knowlesi rRNA")))
) +
# scale_fill_viridis(
# option = "viridis", discrete = T,
# # limits = rev,
# labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")),
# expression(italic("P. knowlesi rRNA")))
# ) +
scale_y_discrete(limits = rev) +
scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(y = "Sample", x = "Sum of TPM counts", fill = "RNA type") +
theme_classic() +
theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
ggtitle("B")
p2_c <- gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
filter(species == "pk" & gene_biotype == "protein_coding") %>%
group_by(sample_name,) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>%
ggplot(aes(x = sum_reads, y = sample_name)) +
geom_bar(position = "stack", stat = "identity", fill = "#FDE725FF") +
# scale_fill_manual(viridis::viridis(5)[4:5]) +
# scale_fill_manual(
# values = c(pk = "#5DC863FF", pk_rRNA = "#FDE725FF"),
# # scale_fill_viridis(
# # option = "viridis", discrete = T,
# # # limits = rev,
# labels = c(expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
# ) +
scale_y_discrete(limits = rev) +
scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(y = "Sample", x = str_wrap("Sum of *P. knowlesi* protein-coding gene counts", 10), fill = "RNA type") +
theme_classic() +
theme(axis.title.y = element_blank(), axis.text.y = element_blank(), axis.title.x = ggtext::element_markdown()) +
ggtitle("C")
fig <- ggarrange(
p2_a, p2_b, p2_c,
ncol = 3,
legend = "bottom",
common.legend = TRUE,
legend.grob = get_legend(p2_b)
# widths = c(2,1,2)
)
fig
ggsave(filename = here("results/figures/gene_counts.svg"),
plot = fig, width=10, height=6)
p2_a <- gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev,
labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Sum of gene counts", fill = "RNA type") +
theme_classic() +
ggtitle("A")
# print p2a to show y labels
p2_a
p2_a <- p2_a + theme(axis.title.y = element_blank(), axis.text.y = element_blank())
p2_b <- gene_tpm_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev,
labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = "Sum of TPM counts", fill = "RNA type") +
theme_classic() +
theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
ggtitle("B")
p2_c <- gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
filter(species == "pk") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
ggplot(aes(y = sum_reads, x = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
# scale_fill_manual(viridis::viridis(5)[4:5]) +
scale_fill_manual(
values = c(pk = "#5DC863FF", pk_rRNA = "#FDE725FF"),
# scale_fill_viridis(
# option = "viridis", discrete = T,
# # limits = rev,
labels = c(expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
coord_flip() +
scale_x_discrete(limits = rev) +
scale_y_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(x = "Sample", y = expression(paste("Sum of ", italic("P. knowlesi"), " gene counts")), fill = "RNA type") +
theme_classic() +
theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
ggtitle("C")
fig <- ggarrange(
p2_a, p2_b, p2_c,
ncol = 3,
common.legend = TRUE, legend = "bottom"
)
fig
samples_order <- samples %>%
arrange(factor(samples$kit, levels = c("mRNA_illumina", "mRNA_qiagen_FSglobH", "mRNA_qiagen_FSglob", "tRNA_qiagen_FSglobH")),
factor(samples$WBC_depletion, levels = c("nofilter", "centpip", "plasmo", "pmacs", "cellulose"))) %>%
select(sample_name)
p2_a <- gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
# arrange(factor(sample_name, levels = samples_order %>% pull())) %>%
mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>%
ggplot(aes(x = sum_reads, y = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev,
labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
scale_y_discrete(limits = rev) +
scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(y = "Sample", x = "Sum of gene counts", fill = "RNA type") +
theme_classic() +
ggtitle("A")
# print p2a to show y labels
p2_a
p2_a <- p2_a + theme(axis.title.y = element_blank(), axis.text.y = element_blank())
p2_b <- gene_tpm_filtered_long %>%
left_join(samples, by = "sample") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>%
ggplot(aes(x = sum_reads, y = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
scale_fill_viridis(
option = "viridis", discrete = T,
# limits = rev,
labels = c("human", "human globin", "human rRNA", expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
scale_y_discrete(limits = rev) +
scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(y = "Sample", x = "Sum of TPM counts", fill = "RNA type") +
theme_classic() +
theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
ggtitle("B")
p2_c <- gene_counts_txi_filtered_long %>%
left_join(samples, by = "sample") %>%
filter(species == "pk") %>%
group_by(sample_name, gene_type) %>%
summarise(sum_reads = sum(counts, na.rm = TRUE)) %>%
mutate(per = 100 * sum_reads / sum(sum_reads)) %>%
mutate(sample_name = factor(sample_name, levels = samples_order %>% pull())) %>%
ggplot(aes(x = sum_reads, y = sample_name, fill = gene_type)) +
geom_bar(position = "stack", stat = "identity") +
# scale_fill_manual(viridis::viridis(5)[4:5]) +
scale_fill_manual(
values = c(pk = "#5DC863FF", pk_rRNA = "#FDE725FF"),
# scale_fill_viridis(
# option = "viridis", discrete = T,
# # limits = rev,
labels = c(expression(italic("P. knowlesi")), expression(italic("P. knowlesi rRNA")))
) +
scale_y_discrete(limits = rev) +
scale_x_continuous(labels = scales::label_number(scale_cut = cut_short_scale())) +
labs(y = "Sample", x = expression(paste("Sum of ", italic("P. knowlesi"), " gene counts")), fill = "RNA type") +
theme_classic() +
theme(axis.title.y = element_blank(), axis.text.y = element_blank()) +
ggtitle("C")
fig <- ggarrange(
p2_a, p2_b, p2_c,
ncol = 3,
common.legend = TRUE, legend = "bottom"
)
fig
heatmap_of_distances <- function(transformed_data, title) {
sampleDists <- dist(t(assay(transformed_data)))
sampleDistMatrix <- as.matrix(sampleDists)
rownames(sampleDistMatrix) <- transformed_data$sample_short
colnames(sampleDistMatrix) <- NULL
# add grouping columns
anno <- as.data.frame(colData(transformed_data)[, c("kit_name", "filter_name")]) %>%
mutate(kit_name = as.factor(kit_name)) %>%
mutate(filter_name = as.factor(filter_name)) %>%
rename_with(~ str_to_title(str_replace(.x, "_", " ")))
# set names of samples
colnames(sampleDistMatrix) <- (transformed_data)$sample_short
rownames(anno) <- (transformed_data)$sample_short
# Adapted from: https://slowkow.com/notes/pheatmap-tutorial/
mat_colors <- list(
`Kit Name` = hcl.colors(length(unique(anno$`Kit Name`)), palette = "plasma", alpha = 1),
`Filter Name` = palette.colors(length(unique(anno$`Filter Name`)), palette = "Okabe-Ito", alpha = 1)
)
names(mat_colors$`Kit Name`) <- unique(anno$`Kit Name`)
names(mat_colors$`Filter Name`) <- unique(anno$`Filter Name`)
# create heatmap
p <- pheatmap(sampleDistMatrix,
annotation_col = anno,
annotation_colors = mat_colors,
clustering_distance_rows = sampleDists,
clustering_distance_cols = sampleDists,
color = viridis(
n = 256, alpha = 1,
begin = 0, end = 1, option = "viridis"
),
# main = "Sample-to-sample Euclidian distance based on all genes",
# main = title,
scale = "none",
show_rownames = TRUE,
show_colnames=FALSE
)
return(p)
}
p <- heatmap_of_distances(rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding", ], str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi** protein-coding genes", 40))
ggsave(filename = here("results/figures/genes_pk_coding_sample_dist_rlog.svg"),
plot = p)
p
p <- heatmap_of_distances(rld_pk_filtered_coding, str_wrap("Sample-to-sample Euclidian distance based on *P. knowlesi* protein-coding genes - rlog after subset", 40))
ggsave(filename = here("results/figures/genes_pk_coding_sample_dist_rlog_transform_after_subset.svg"),
plot = p)
p
heatmap_of_counts <- function(raw_data, transformed_data, top_type = "mean", top_n = 20, center = TRUE) {
# select n most highly expressed/variable genes
if (top_type == "mean") {
select <- order(rowMeans(counts(raw_data, normalized = FALSE)),
decreasing = TRUE
)[1:top_n]
} else if (top_type == "median") {
select <- order(rowMedians(counts(raw_data, normalized = FALSE)),
decreasing = TRUE
)[1:top_n]
} else if (top_type == "var") {
select <- head(order(rowVars(assay(transformed_data)), decreasing = TRUE), top_n)
}
# select transformed data and center if requested
mat <- assay(transformed_data)[select, ]
if (center == TRUE){
mat <- mat - rowMeans(mat)
}
# add grouping columns
anno <- as.data.frame(colData(transformed_data)[, c("kit_name", "filter_name")]) %>%
mutate(kit_name = as.factor(kit_name)) %>%
mutate(filter_name = as.factor(filter_name)) %>%
rename_with(~ str_to_title(str_replace(.x, "_", " ")))
# set names of samples
colnames(mat) <- (transformed_data)$sample_short
rownames(anno) <- (transformed_data)$sample_short
# Adapted from: https://slowkow.com/notes/pheatmap-tutorial/
mat_colors <- list(
`Kit Name` = hcl.colors(length(unique(anno$`Kit Name`)), palette = "plasma", alpha = 1),
`Filter Name` = palette.colors(length(unique(anno$`Filter Name`)), palette = "Okabe-Ito", alpha = 1)
)
names(mat_colors$`Kit Name`) <- unique(anno$`Kit Name`)
names(mat_colors$`Filter Name`) <- unique(anno$`Filter Name`)
# create heatmap
pheatmap(mat,
annotation_col = anno,
annotation_colors = mat_colors,
color = viridis(n = 256, alpha = 1, begin = 0, end = 1, option = "viridis"),
# cluster_rows = FALSE,
# cluster_cols = FALSE,
show_rownames = FALSE,
)
}
p <- heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk" & rowData(gse_filtered)$gene_biotype == "protein_coding"],
rld_filtered[rowData(rld_filtered)$species == "pk" & rowData(rld_filtered)$gene_biotype == "protein_coding"],
"var", 20, TRUE)
ggsave(filename = here("results/figures/genes_pk_coding_top_var_rlog.svg"),
plot = p)
p
p <- heatmap_of_counts(gse_pk_filtered_coding,
rld_pk_filtered_coding,
"var", 20, TRUE)
ggsave(filename = here("results/figures/genes_pk_coding_top_var_rlog_transform_after_subset.svg"),
plot = p)
p
p <- heatmap_of_counts(gse_filtered[rowData(gse_filtered)$species == "pk"],
vsd_filtered[rowData(gse_filtered)$species == "pk"],
"var", 20, TRUE)
ggsave(filename = here("results/figures/genes_pk_all_top_var_rlog.svg"),
plot = p)
p
p <- heatmap_of_counts(gse_pk_filtered,
rld_pk_filtered,
"var", 20, TRUE)
ggsave(filename = here("results/figures/genes_pk_all_top_var_rlog_transform_after_subset.svg"),
plot = p)
p
pcaData <- plotPCA(rld_pk_filtered_coding, intgroup = c("kit_name", "filter_name"), returnData = TRUE, ntop = 500)
percentVar <- round(100 * attr(pcaData, "percentVar"))
p <- ggplot(pcaData, aes(PC1, PC2, color = kit_name, shape = filter_name)) +
geom_point(size = 4) +
xlab(paste0("PC1: ", percentVar[1], "% variance")) +
ylab(paste0("PC2: ", percentVar[2], "% variance")) +
labs(
title = expression(paste("PCA of rlog counts of ", italic("P. knowlesi"), " protein coding genes")),
color = "Library preparation kit", shape = "Depletion method"
) +
coord_fixed() +
scale_color_viridis(discrete = TRUE) +
theme_bw()
ggsave(filename = here("results/figures/pca_pk_coding_rlog_transform_after_subset.svg"),
plot = p)
p
See previous section for in-line plots; the following code chunk only exports the relevant files.
draw_venn <- function(set_list, names, title, output) {
VennDiagram::venn.diagram(
x = set_list,
category.names = names,
# Circles
lwd = 2,
col = hcl.colors(3, palette = "viridis", alpha = 0.9),
fill = hcl.colors(3, palette = "viridis", alpha = 0.4),
# Numbers
cex = 1.4,
fontface = "bold",
fontfamily = "sans",
# Set names
cat.cex = 1.1,
cat.fontface = "bold",
cat.default.pos = "outer",
cat.pos = c(-15, 15, 125),
cat.dist = c(0.055, 0.055, 0.09),
cat.fontfamily = "sans",
rotation = 1,
# title
main = title,
main.cex = 1.4,
main.fontface = "bold",
main.fontfamily = "sans",
# output features
# height = 1400,
# width = 1400,
units = "px",
resolution = 600,
# compression = "lzw",
filename = output,
output = TRUE,
disable.logging = TRUE,
imagetype = "png"
)
}
p <- draw_venn(
gene_tpm_filtered_binary_list[comparison_centpip$sample],
# c("mRNA Qiagen FS globin", "total RNA Qiagen FS globin/rRNA", "mRNA Illumina"),
c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
"Cent/pip",
here("results/figures/venn-centpip.png")
)
draw_venn(
gene_tpm_filtered_binary_list[comparison_plasmo$sample],
c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
"Plasmo",
here("results/figures/venn-plasmo.png")
)
draw_venn(
gene_tpm_filtered_binary_list[comparison_pmacs$sample],
c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
"PMACS",
here("results/figures/venn-pmacs.png")
)
draw_venn(
gene_tpm_filtered_binary_list[comparison_cellulose$sample],
c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
"Cellulose",
here("results/figures/venn-cellulose.png")
)
draw_venn(
gene_tpm_filtered_binary_list[comparison_nofilter$sample],
c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
"Cellulose",
here("results/figures/venn-nofilter.png")
)
draw_venn(
gene_tpm_pk_filtered_binary_list[comparison_centpip$sample],
c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
"Cent/pip",
here("results/figures/venn-centpip_pk.png")
)
draw_venn(
gene_tpm_pk_filtered_binary_list[comparison_plasmo$sample],
c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
"Plasmo",
here("results/figures/venn-plasmo_pk.png")
)
draw_venn(
gene_tpm_pk_filtered_binary_list[comparison_pmacs$sample],
c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
"PMACS",
here("results/figures/venn-pmacs_pk.png")
)
draw_venn(
gene_tpm_pk_filtered_binary_list[comparison_cellulose$sample],
c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
"Cellulose",
here("results/figures/venn-cellulose_pk.png")
)
draw_venn(
gene_tpm_pk_filtered_binary_list[comparison_nofilter$sample],
c("Qiagen mRNA FSglob", "Qiagen total RNA FSglobH", "Illumina mRNA"),
"No filter",
here("results/figures/venn-nofilter_pk.png")
)
mat <- round(
cor(
gene_tpm_filtered %>%
filter(species == "pk") %>%
filter(gene_biotype == "protein_coding") %>%
select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`) %>%
rename_with(~samples %>% pull(sample_name) %>% as.character(), everything())
),
1)
corr <- mat
p <- ggcorrplot(corr,
type = "lower",
method = "circle",
hc.order = FALSE,
outline.color = "white",
p.mat = cor_pmat(mat),
lab = TRUE,
legend.title = "Pearson correlation of TPM values") +
scale_fill_gradientn(colors = viridis(256, option = 'D'), limits = c(-1,1), name = "Pearson correlation")
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
ggsave(filename = here("results/figures/pk_coding_tpm_correlation.svg"),
plot = p,
width = 10, height = 10)
p
Ordered by mRNA - total RNA library preparation kit:
mat <- round(
cor(
gene_tpm_filtered %>%
filter(species == "pk") %>%
filter(gene_biotype == "protein_coding") %>%
select(`104974-001-001_S1_L001`:`104974-001-016_S1_L001`) %>%
rename_with(~samples %>% pull(sample_name) %>% as.character(), everything()) %>%
relocate(samples_order %>% pull())
),
1)
corr <- mat
p <- ggcorrplot(corr,
type = "lower",
method = "circle",
hc.order = FALSE,
outline.color = "white",
p.mat = cor_pmat(mat),
lab = TRUE,
legend.title = "Pearson correlation of TPM values") +
scale_fill_gradientn(colors = viridis(256, option = 'D'), limits = c(-1,1), name = "Pearson correlation")
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.
ggsave(filename = here("results/figures/pk_coding_tpm_correlation_ordered.svg"),
plot = p,
width = 10, height = 10)
p
Pk protein coding genes only:
correlation_plot
ggsave(filename = here("results/figures/correlation_protein_coding_genes_pk.png"),
plot = correlation_plot,
height = 4000, width = 6000, units = "px")
ggsave(filename = here("results/figures/correlation_protein_coding_genes_pk.svg"),
plot = correlation_plot,
height = 4000, width = 6000, units = "px")
# order by tRNA - mRNA
samples_order <- samples %>%
arrange(factor(samples$kit,
levels = c("mRNA_illumina", "mRNA_qiagen_FSglobH", "mRNA_qiagen_FSglob", "tRNA_qiagen_FSglobH")),
factor(samples$WBC_depletion,
levels = c("nofilter", "centpip", "plasmo", "pmacs", "cellulose")))
correlation_plot_ordered <- ggpairs(
gene_tpm_filtered %>%
filter(species == "pk") %>%
filter(gene_biotype == "protein_coding") %>%
relocate(
samples_order %>% select(sample) %>% pull()
),
columns = 1:16,
# columnLabels = samples_order %>% select(sample_short) %>% pull(),
columnLabels = samples_order %>% select(sample_short) %>% mutate(sample_short = factor(sample_short, levels = sample_short)) %>% pull(),
upper = list(continuous = wrap("cor", size = 5)),
) +
theme(
axis.text = element_text(size = 9),
axis.title = element_text(size = 9),
strip.text.x = element_text(size = 11),
strip.text.y = element_text(size = 10),
axis.text.x = element_text(size = 6),
axis.text.y = element_text(size = 9)
)
# scale_y_discrete()
correlation_plot_ordered
ggsave(filename = here("results/figures/correlation_protein_coding_genes_pk_ordered.png"),
plot = correlation_plot_ordered,
height = 4000, width = 6000, units = "px")
ggsave(filename = here("results/figures/correlation_protein_coding_genes_pk_ordered.svg"),
plot = correlation_plot_ordered,
height = 4000, width = 6000, units = "px")
All Pk genes:
correlation_plot_log
ggsave(here("results/figures/correlation_protein_coding_genes_logscaled.png"),
plot = correlation_plot_log,
height = 4000, width = 6000, units = "px")
ggsave(here("results/figures/correlation_protein_coding_genes_logscaled.svg"),
plot = correlation_plot_log,
height = 4000, width = 6000, units = "px")
# order by tRNA-mRNA
correlation_plot_log_ordered <- ggpairs(
gene_tpm_filtered %>%
filter(species == "pk") %>%
filter(gene_biotype == "protein_coding") %>%
relocate(
samples_order %>% select(sample) %>% pull()
),
columns = 1:16, # `104974-001-001_S1_L001`:`104974-001-002_S1_L001`,
lower = list(continuous = "smooth"),
upper = list(continuous = wrap("cor", size = 5)),
columnLabels = samples_order %>% select(sample_short) %>% mutate(sample_short = factor(sample_short, levels = sample_short)) %>% pull(),,
) +
theme(
axis.text = element_text(size = 9),
axis.title = element_text(size = 9),
strip.text.x = element_text(size = 11),
strip.text.y = element_text(size = 10),
axis.text.x = element_text(size = 6),
axis.text.y = element_text(size = 9)
) +
scale_x_continuous(trans = "log1p") +
scale_y_continuous(trans = "log1p")
correlation_plot_log
ggsave(here("results/figures/correlation_protein_coding_genes_logscaled_ordered.png"),
plot = correlation_plot_log_ordered,
height = 4000, width = 6000, units = "px")
ggsave(here("results/figures/correlation_protein_coding_genes_logscaled_ordered.svg"),
plot = correlation_plot_log_ordered,
height = 4000, width = 6000, units = "px")